Update libyuv to r1602 version to get best performance.
Bug: 29870647
Change-Id: I8ec9fab7f55765fa33ebe7ba1c7ad2147f418de2
diff --git a/files/source/compare.cc b/files/source/compare.cc
index bf4a7da..e3846bd 100644
--- a/files/source/compare.cc
+++ b/files/source/compare.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -17,8 +17,10 @@
#endif
#include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
+#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
@@ -26,363 +28,133 @@
#endif
// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
- uint32 hash = seed;
- for (int i = 0; i < count; ++i) {
- hash += (hash << 5) + src[i];
- }
- return hash;
-}
-
-// This module is for Visual C x86
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_HASHDJB2_SSE41
-static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-static const uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static const uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static const uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static const uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
-// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
-// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
-// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
-// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
- _asm _emit 0x40 _asm _emit reg
-
-__declspec(naked) __declspec(align(16))
-static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- movd xmm0, [esp + 12] // seed
-
- pxor xmm7, xmm7 // constant 0 for unpck
- movdqa xmm6, kHash16x33
-
- align 16
- wloop:
- movdqu xmm1, [eax] // src[0-15]
- lea eax, [eax + 16]
- pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
- movdqa xmm5, kHashMul0
- movdqa xmm2, xmm1
- punpcklbw xmm2, xmm7 // src[0-7]
- movdqa xmm3, xmm2
- punpcklwd xmm3, xmm7 // src[0-3]
- pmulld(0xdd) // pmulld xmm3, xmm5
- movdqa xmm5, kHashMul1
- movdqa xmm4, xmm2
- punpckhwd xmm4, xmm7 // src[4-7]
- pmulld(0xe5) // pmulld xmm4, xmm5
- movdqa xmm5, kHashMul2
- punpckhbw xmm1, xmm7 // src[8-15]
- movdqa xmm2, xmm1
- punpcklwd xmm2, xmm7 // src[8-11]
- pmulld(0xd5) // pmulld xmm2, xmm5
- movdqa xmm5, kHashMul3
- punpckhwd xmm1, xmm7 // src[12-15]
- pmulld(0xcd) // pmulld xmm1, xmm5
- paddd xmm3, xmm4 // add 16 results
- paddd xmm1, xmm2
- sub ecx, 16
- paddd xmm1, xmm3
-
- pshufd xmm2, xmm1, 14 // upper 2 dwords
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 1
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- jg wloop
-
- movd eax, xmm0 // return hash
- ret
- }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-#define HAS_HASHDJB2_SSE41
-CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-CONST uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-CONST uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-CONST uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-CONST uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- uint32 hash;
- asm volatile (
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "sub $0x10,%1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
- return hash;
-}
-#endif // HAS_HASHDJB2_SSE41
-
-// hash seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
- uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+ const int kBlockSize = 1 << 15; // 32768;
+ int remainder;
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
+ HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
}
#endif
+#if defined(HAS_HASHDJB2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HashDjb2_SSE = HashDjb2_AVX2;
+ }
+#endif
- const int kBlockSize = 1 << 15; // 32768;
- while (count >= static_cast<uint64>(kBlockSize)) {
+ while (count >= (uint64)(kBlockSize)) {
seed = HashDjb2_SSE(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
}
- int remainder = static_cast<int>(count) & ~15;
+ remainder = (int)(count) & ~15;
if (remainder) {
seed = HashDjb2_SSE(src, remainder, seed);
src += remainder;
count -= remainder;
}
- remainder = static_cast<int>(count) & 15;
+ remainder = (int)(count) & 15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
}
return seed;
}
-#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SUMSQUAREERROR_NEON
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
-
-#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_SUMSQUAREERROR_SSE2
-__declspec(naked) __declspec(align(16))
-static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
- int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- pxor xmm0, xmm0
- pxor xmm5, xmm5
- sub edx, eax
-
- align 16
- wloop:
- movdqa xmm1, [eax]
- movdqa xmm2, [eax + edx]
- lea eax, [eax + 16]
- sub ecx, 16
- movdqa xmm3, xmm1 // abs trick
- psubusb xmm1, xmm2
- psubusb xmm2, xmm3
- por xmm1, xmm2
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm0, xmm1
- paddd xmm0, xmm2
- jg wloop
-
- pshufd xmm1, xmm0, 0EEh
- paddd xmm0, xmm1
- pshufd xmm1, xmm0, 01h
- paddd xmm0, xmm1
- movd eax, xmm0
- ret
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
+ return FOURCC_BGRA;
+ }
+ if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ return FOURCC_ARGB;
+ }
+ argb += 8;
}
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_SUMSQUAREERROR_SSE2
-static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
- int count) {
- uint32 sse;
- asm volatile (
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm1 \n"
- "movdqa (%0,%1,1),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "jg 1b \n"
-
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
-
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
- );
- return sse;
-}
-#endif
-
-static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
- int count) {
- uint32 sse = 0u;
- for (int i = 0; i < count; ++i) {
- int diff = src_a[i] - src_b[i];
- sse += static_cast<uint32>(diff * diff);
+ if (width & 1) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
}
- return sse;
+ return 0;
}
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+ uint32 fourcc = 0;
+ int h;
+
+ // Coalesce rows.
+ if (stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ stride_argb = 0;
+ }
+ for (h = 0; h < height && fourcc == 0; ++h) {
+ fourcc = ARGBDetectRow_C(argb, width);
+ argb += stride_argb;
+ }
+ return fourcc;
+}
+
+// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
int count) {
+ // SumSquareError returns values 0 to 65535 for each squared difference.
+ // Up to 65536 of those can be summed and remain within a uint32.
+ // After each block of 65536 pixels, accumulate into a uint64.
+ const int kBlockSize = 65536;
+ int remainder = count & (kBlockSize - 1) & ~31;
+ uint64 sse = 0;
+ int i;
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
}
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
#endif
- // 32K values will fit a 32bit int return value from SumSquareError.
- // After each block of 32K, accumulate into 64 bit int.
- const int kBlockSize = 1 << 15; // 32768;
- uint64 sse = 0;
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ // Note only used for multiples of 32 so count is not checked.
+ SumSquareError = SumSquareError_AVX2;
+ }
+#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#endif
- for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
}
src_a += count & ~(kBlockSize - 1);
src_b += count & ~(kBlockSize - 1);
- int remainder = count & (kBlockSize - 1) & ~15;
if (remainder) {
sse += SumSquareError(src_a, src_b, remainder);
src_a += remainder;
src_b += remainder;
}
- remainder = count & 15;
+ remainder = count & 31;
if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder);
}
@@ -393,27 +165,20 @@
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
- uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
- SumSquareError_C;
-#if defined(HAS_SUMSQUAREERROR_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SumSquareError = SumSquareError_NEON;
- }
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
- IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
- SumSquareError = SumSquareError_SSE2;
- }
-#endif
-
uint64 sse = 0;
- for (int h = 0; h < height; ++h) {
- sse += SumSquareError(src_a, src_b, width);
+ int h;
+ // Coalesce rows.
+ if (stride_a == width &&
+ stride_b == width) {
+ width *= height;
+ height = 1;
+ stride_a = stride_b = 0;
+ }
+ for (h = 0; h < height; ++h) {
+ sse += ComputeSumSquareError(src_a, src_b, width);
src_a += stride_a;
src_b += stride_b;
}
-
return sse;
}
@@ -421,7 +186,7 @@
double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
double psnr;
if (sse > 0) {
- double mse = static_cast<double>(count) / static_cast<double>(sse);
+ double mse = (double)(count) / (double)(sse);
psnr = 10.0 * log10(255.0 * 255.0 * mse);
} else {
psnr = kMaxPsnr; // Limit to prevent divide by 0
@@ -479,8 +244,10 @@
int64 sum_sq_b = 0;
int64 sum_axb = 0;
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ int j;
+ for (j = 0; j < 8; ++j) {
sum_a += src_a[j];
sum_b += src_b[j];
sum_sq_a += src_a[j] * src_a[j];
@@ -492,26 +259,29 @@
src_b += stride_b;
}
- const int64 count = 64;
- // scale the constants by number of pixels
- const int64 c1 = (cc1 * count * count) >> 12;
- const int64 c2 = (cc2 * count * count) >> 12;
+ {
+ const int64 count = 64;
+ // scale the constants by number of pixels
+ const int64 c1 = (cc1 * count * count) >> 12;
+ const int64 c2 = (cc2 * count * count) >> 12;
- const int64 sum_a_x_sum_b = sum_a * sum_b;
+ const int64 sum_a_x_sum_b = sum_a * sum_b;
- const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
- (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+ const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
- const int64 sum_a_sq = sum_a*sum_a;
- const int64 sum_b_sq = sum_b*sum_b;
+ const int64 sum_a_sq = sum_a*sum_a;
+ const int64 sum_b_sq = sum_b*sum_b;
- const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
- (count * sum_sq_a - sum_a_sq +
- count * sum_sq_b - sum_b_sq + c2);
+ const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq +
+ count * sum_sq_b - sum_b_sq + c2);
- if (ssim_d == 0.0)
- return DBL_MAX;
- return ssim_n * 1.0 / ssim_d;
+ if (ssim_d == 0.0) {
+ return DBL_MAX;
+ }
+ return ssim_n * 1.0 / ssim_d;
+ }
}
// We are using a 8x8 moving window with starting location of each 8x8 window
@@ -523,15 +293,14 @@
int width, int height) {
int samples = 0;
double ssim_total = 0;
-
double (*Ssim8x8)(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b);
-
- Ssim8x8 = Ssim8x8_C;
+ const uint8* src_b, int stride_b) = Ssim8x8_C;
// sample point start with each 4x4 location
- for (int i = 0; i < height - 8; i += 4) {
- for (int j = 0; j < width - 8; j += 4) {
+ int i;
+ for (i = 0; i < height - 8; i += 4) {
+ int j;
+ for (j = 0; j < width - 8; j += 4) {
ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
samples++;
}
diff --git a/files/source/compare_common.cc b/files/source/compare_common.cc
new file mode 100644
index 0000000..42fc589
--- /dev/null
+++ b/files/source/compare_common.cc
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse = 0u;
+ int i;
+ for (i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += (uint32)(diff * diff);
+ }
+ return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ int i;
+ for (i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
new file mode 100644
index 0000000..1b83edb
--- /dev/null
+++ b/files/source/compare_gcc.cc
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10, 1) ",%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+ return sse;
+}
+
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile (
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
index d8b375b..49aa3b4 100644
--- a/files/source/compare_neon.cc
+++ b/files/source/compare_neon.cc
@@ -4,18 +4,22 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
@@ -25,10 +29,11 @@
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
- ".p2align 2 \n"
"1: \n"
- "vld1.u8 {q0}, [%0]! \n"
- "vld1.u8 {q1}, [%1]! \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
@@ -53,10 +58,9 @@
return sse;
}
-#endif // __ARM_NEON__
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-
diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc
new file mode 100644
index 0000000..f9c7df9
--- /dev/null
+++ b/files/source/compare_neon64.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc
new file mode 100644
index 0000000..dc86fe2
--- /dev/null
+++ b/files/source/compare_win.cc
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+__declspec(naked)
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+
+ wloop:
+ movdqu xmm1, [eax]
+ lea eax, [eax + 16]
+ movdqu xmm2, [edx]
+ lea edx, [edx + 16]
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ sub ecx, 16
+ jg wloop
+
+ pshufd xmm1, xmm0, 0xee
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 0x01
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked)
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ vpxor ymm0, ymm0, ymm0 // sum
+ vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
+ sub edx, eax
+
+ wloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + edx]
+ lea eax, [eax + 32]
+ vpsubusb ymm3, ymm1, ymm2 // abs difference trick
+ vpsubusb ymm2, ymm2, ymm1
+ vpor ymm1, ymm2, ymm3
+ vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
+ vpunpckhbw ymm1, ymm1, ymm5
+ vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
+ vpmaddwd ymm1, ymm1, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm2
+ sub ecx, 32
+ jg wloop
+
+ vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1
+ vmovd eax, xmm0
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+__declspec(naked)
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, xmmword ptr kHash16x33
+
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ movdqa xmm5, xmmword ptr kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld xmm3, xmm5
+ movdqa xmm5, xmmword ptr kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld xmm4, xmm5
+ movdqa xmm5, xmmword ptr kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld xmm2, xmm5
+ movdqa xmm5, xmmword ptr kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ sub ecx, 16
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked)
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ vmovd xmm0, [esp + 12] // seed
+
+ wloop:
+ vpmovzxbd xmm3, [eax] // src[0-3]
+ vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
+ vpmovzxbd xmm4, [eax + 4] // src[4-7]
+ vpmulld xmm3, xmm3, xmmword ptr kHashMul0
+ vpmovzxbd xmm2, [eax + 8] // src[8-11]
+ vpmulld xmm4, xmm4, xmmword ptr kHashMul1
+ vpmovzxbd xmm1, [eax + 12] // src[12-15]
+ vpmulld xmm2, xmm2, xmmword ptr kHashMul2
+ lea eax, [eax + 16]
+ vpmulld xmm1, xmm1, xmmword ptr kHashMul3
+ vpaddd xmm3, xmm3, xmm4 // add 16 results
+ vpaddd xmm1, xmm1, xmm2
+ vpaddd xmm1, xmm1, xmm3
+ vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ vpaddd xmm1, xmm1,xmm2
+ vpshufd xmm2, xmm1, 0x01
+ vpaddd xmm1, xmm1, xmm2
+ vpaddd xmm0, xmm0, xmm1
+ sub ecx, 16
+ jg wloop
+
+ vmovd eax, xmm0 // return hash
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/conversion_tables.h b/files/source/conversion_tables.h
deleted file mode 100644
index ef3ebf3..0000000
--- a/files/source/conversion_tables.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-/**************************************************************
-* conversion_tables.h
-*
-* Pre-compiled definitions of the conversion equations: YUV -> RGB.
-*
-***************************************************************/
-
-#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_
-#define LIBYUV_SOURCE_CONVERSION_TABLES_H_
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-/******************************************************************************
-* YUV TO RGB approximation
-*
-* R = clip( (298 * (Y - 16) + 409 * (V - 128) + 128 ) >> 8 )
-* G = clip( (298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128 ) >> 8 )
-* B = clip( (298 * (Y - 16) + 516 * (U - 128) + 128 ) >> 8 )
-*******************************************************************************/
-
- #define Yc(i) static_cast<int> ( 298 * ( i - 16 )) // Y contribution
- #define Ucg(i) static_cast<int> ( -100 * ( i - 128 ))// U contribution to G
- #define Ucb(i) static_cast<int> ( 516 * ( i - 128 ))// U contribution to B
- #define Vcr(i) static_cast<int> ( 409 * ( i - 128 ))// V contribution to R
- #define Vcg(i) static_cast<int> ( -208 * ( i - 128 ))// V contribution to G
-
- static const int mapYc[256] = {
- Yc(0),Yc(1),Yc(2),Yc(3),Yc(4),Yc(5),Yc(6),Yc(7),Yc(8),Yc(9),
- Yc(10),Yc(11),Yc(12),Yc(13),Yc(14),Yc(15),Yc(16),Yc(17),Yc(18),Yc(19),
- Yc(20),Yc(21),Yc(22),Yc(23),Yc(24),Yc(25),Yc(26),Yc(27),Yc(28),Yc(29),
- Yc(30),Yc(31),Yc(32),Yc(33),Yc(34),Yc(35),Yc(36),Yc(37),Yc(38),Yc(39),
- Yc(40),Yc(41),Yc(42),Yc(43),Yc(44),Yc(45),Yc(46),Yc(47),Yc(48),Yc(49),
- Yc(50),Yc(51),Yc(52),Yc(53),Yc(54),Yc(55),Yc(56),Yc(57),Yc(58),Yc(59),
- Yc(60),Yc(61),Yc(62),Yc(63),Yc(64),Yc(65),Yc(66),Yc(67),Yc(68),Yc(69),
- Yc(70),Yc(71),Yc(72),Yc(73),Yc(74),Yc(75),Yc(76),Yc(77),Yc(78),Yc(79),
- Yc(80),Yc(81),Yc(82),Yc(83),Yc(84),Yc(85),Yc(86),Yc(87),Yc(88),Yc(89),
- Yc(90),Yc(91),Yc(92),Yc(93),Yc(94),Yc(95),Yc(96),Yc(97),Yc(98),Yc(99),
- Yc(100),Yc(101),Yc(102),Yc(103),Yc(104),Yc(105),Yc(106),Yc(107),Yc(108),
- Yc(109),Yc(110),Yc(111),Yc(112),Yc(113),Yc(114),Yc(115),Yc(116),Yc(117),
- Yc(118),Yc(119),Yc(120),Yc(121),Yc(122),Yc(123),Yc(124),Yc(125),Yc(126),
- Yc(127),Yc(128),Yc(129),Yc(130),Yc(131),Yc(132),Yc(133),Yc(134),Yc(135),
- Yc(136),Yc(137),Yc(138),Yc(139),Yc(140),Yc(141),Yc(142),Yc(143),Yc(144),
- Yc(145),Yc(146),Yc(147),Yc(148),Yc(149),Yc(150),Yc(151),Yc(152),Yc(153),
- Yc(154),Yc(155),Yc(156),Yc(157),Yc(158),Yc(159),Yc(160),Yc(161),Yc(162),
- Yc(163),Yc(164),Yc(165),Yc(166),Yc(167),Yc(168),Yc(169),Yc(170),Yc(171),
- Yc(172),Yc(173),Yc(174),Yc(175),Yc(176),Yc(177),Yc(178),Yc(179),Yc(180),
- Yc(181),Yc(182),Yc(183),Yc(184),Yc(185),Yc(186),Yc(187),Yc(188),Yc(189),
- Yc(190),Yc(191),Yc(192),Yc(193),Yc(194),Yc(195),Yc(196),Yc(197),Yc(198),
- Yc(199),Yc(200),Yc(201),Yc(202),Yc(203),Yc(204),Yc(205),Yc(206),Yc(207),
- Yc(208),Yc(209),Yc(210),Yc(211),Yc(212),Yc(213),Yc(214),Yc(215),Yc(216),
- Yc(217),Yc(218),Yc(219),Yc(220),Yc(221),Yc(222),Yc(223),Yc(224),Yc(225),
- Yc(226),Yc(227),Yc(228),Yc(229),Yc(230),Yc(231),Yc(232),Yc(233),Yc(234),
- Yc(235),Yc(236),Yc(237),Yc(238),Yc(239),Yc(240),Yc(241),Yc(242),Yc(243),
- Yc(244),Yc(245),Yc(246),Yc(247),Yc(248),Yc(249),Yc(250),Yc(251),Yc(252),
- Yc(253),Yc(254),Yc(255)};
-
- static const int mapUcg[256] = {
- Ucg(0),Ucg(1),Ucg(2),Ucg(3),Ucg(4),Ucg(5),Ucg(6),Ucg(7),Ucg(8),Ucg(9),
- Ucg(10),Ucg(11),Ucg(12),Ucg(13),Ucg(14),Ucg(15),Ucg(16),Ucg(17),Ucg(18),
- Ucg(19),Ucg(20),Ucg(21),Ucg(22),Ucg(23),Ucg(24),Ucg(25),Ucg(26),Ucg(27),
- Ucg(28),Ucg(29),Ucg(30),Ucg(31),Ucg(32),Ucg(33),Ucg(34),Ucg(35),Ucg(36),
- Ucg(37),Ucg(38),Ucg(39),Ucg(40),Ucg(41),Ucg(42),Ucg(43),Ucg(44),Ucg(45),
- Ucg(46),Ucg(47),Ucg(48),Ucg(49),Ucg(50),Ucg(51),Ucg(52),Ucg(53),Ucg(54),
- Ucg(55),Ucg(56),Ucg(57),Ucg(58),Ucg(59),Ucg(60),Ucg(61),Ucg(62),Ucg(63),
- Ucg(64),Ucg(65),Ucg(66),Ucg(67),Ucg(68),Ucg(69),Ucg(70),Ucg(71),Ucg(72),
- Ucg(73),Ucg(74),Ucg(75),Ucg(76),Ucg(77),Ucg(78),Ucg(79),Ucg(80),Ucg(81),
- Ucg(82),Ucg(83),Ucg(84),Ucg(85),Ucg(86),Ucg(87),Ucg(88),Ucg(89),Ucg(90),
- Ucg(91),Ucg(92),Ucg(93),Ucg(94),Ucg(95),Ucg(96),Ucg(97),Ucg(98),Ucg(99),
- Ucg(100),Ucg(101),Ucg(102),Ucg(103),Ucg(104),Ucg(105),Ucg(106),Ucg(107),
- Ucg(108),Ucg(109),Ucg(110),Ucg(111),Ucg(112),Ucg(113),Ucg(114),Ucg(115),
- Ucg(116),Ucg(117),Ucg(118),Ucg(119),Ucg(120),Ucg(121),Ucg(122),Ucg(123),
- Ucg(124),Ucg(125),Ucg(126),Ucg(127),Ucg(128),Ucg(129),Ucg(130),Ucg(131),
- Ucg(132),Ucg(133),Ucg(134),Ucg(135),Ucg(136),Ucg(137),Ucg(138),Ucg(139),
- Ucg(140),Ucg(141),Ucg(142),Ucg(143),Ucg(144),Ucg(145),Ucg(146),Ucg(147),
- Ucg(148),Ucg(149),Ucg(150),Ucg(151),Ucg(152),Ucg(153),Ucg(154),Ucg(155),
- Ucg(156),Ucg(157),Ucg(158),Ucg(159),Ucg(160),Ucg(161),Ucg(162),Ucg(163),
- Ucg(164),Ucg(165),Ucg(166),Ucg(167),Ucg(168),Ucg(169),Ucg(170),Ucg(171),
- Ucg(172),Ucg(173),Ucg(174),Ucg(175),Ucg(176),Ucg(177),Ucg(178),Ucg(179),
- Ucg(180),Ucg(181),Ucg(182),Ucg(183),Ucg(184),Ucg(185),Ucg(186),Ucg(187),
- Ucg(188),Ucg(189),Ucg(190),Ucg(191),Ucg(192),Ucg(193),Ucg(194),Ucg(195),
- Ucg(196),Ucg(197),Ucg(198),Ucg(199),Ucg(200),Ucg(201),Ucg(202),Ucg(203),
- Ucg(204),Ucg(205),Ucg(206),Ucg(207),Ucg(208),Ucg(209),Ucg(210),Ucg(211),
- Ucg(212),Ucg(213),Ucg(214),Ucg(215),Ucg(216),Ucg(217),Ucg(218),Ucg(219),
- Ucg(220),Ucg(221),Ucg(222),Ucg(223),Ucg(224),Ucg(225),Ucg(226),Ucg(227),
- Ucg(228),Ucg(229),Ucg(230),Ucg(231),Ucg(232),Ucg(233),Ucg(234),Ucg(235),
- Ucg(236),Ucg(237),Ucg(238),Ucg(239),Ucg(240),Ucg(241),Ucg(242),Ucg(243),
- Ucg(244),Ucg(245),Ucg(246),Ucg(247),Ucg(248),Ucg(249),Ucg(250),Ucg(251),
- Ucg(252),Ucg(253),Ucg(254),Ucg(255)};
-
- static const int mapUcb[256] = {
- Ucb(0),Ucb(1),Ucb(2),Ucb(3),Ucb(4),Ucb(5),Ucb(6),Ucb(7),Ucb(8),Ucb(9),
- Ucb(10),Ucb(11),Ucb(12),Ucb(13),Ucb(14),Ucb(15),Ucb(16),Ucb(17),Ucb(18),
- Ucb(19),Ucb(20),Ucb(21),Ucb(22),Ucb(23),Ucb(24),Ucb(25),Ucb(26),Ucb(27),
- Ucb(28),Ucb(29),Ucb(30),Ucb(31),Ucb(32),Ucb(33),Ucb(34),Ucb(35),Ucb(36),
- Ucb(37),Ucb(38),Ucb(39),Ucb(40),Ucb(41),Ucb(42),Ucb(43),Ucb(44),Ucb(45),
- Ucb(46),Ucb(47),Ucb(48),Ucb(49),Ucb(50),Ucb(51),Ucb(52),Ucb(53),Ucb(54),
- Ucb(55),Ucb(56),Ucb(57),Ucb(58),Ucb(59),Ucb(60),Ucb(61),Ucb(62),Ucb(63),
- Ucb(64),Ucb(65),Ucb(66),Ucb(67),Ucb(68),Ucb(69),Ucb(70),Ucb(71),Ucb(72),
- Ucb(73),Ucb(74),Ucb(75),Ucb(76),Ucb(77),Ucb(78),Ucb(79),Ucb(80),Ucb(81),
- Ucb(82),Ucb(83),Ucb(84),Ucb(85),Ucb(86),Ucb(87),Ucb(88),Ucb(89),Ucb(90),
- Ucb(91),Ucb(92),Ucb(93),Ucb(94),Ucb(95),Ucb(96),Ucb(97),Ucb(98),Ucb(99),
- Ucb(100),Ucb(101),Ucb(102),Ucb(103),Ucb(104),Ucb(105),Ucb(106),Ucb(107),
- Ucb(108),Ucb(109),Ucb(110),Ucb(111),Ucb(112),Ucb(113),Ucb(114),Ucb(115),
- Ucb(116),Ucb(117),Ucb(118),Ucb(119),Ucb(120),Ucb(121),Ucb(122),Ucb(123),
- Ucb(124),Ucb(125),Ucb(126),Ucb(127),Ucb(128),Ucb(129),Ucb(130),Ucb(131),
- Ucb(132),Ucb(133),Ucb(134),Ucb(135),Ucb(136),Ucb(137),Ucb(138),Ucb(139),
- Ucb(140),Ucb(141),Ucb(142),Ucb(143),Ucb(144),Ucb(145),Ucb(146),Ucb(147),
- Ucb(148),Ucb(149),Ucb(150),Ucb(151),Ucb(152),Ucb(153),Ucb(154),Ucb(155),
- Ucb(156),Ucb(157),Ucb(158),Ucb(159),Ucb(160),Ucb(161),Ucb(162),Ucb(163),
- Ucb(164),Ucb(165),Ucb(166),Ucb(167),Ucb(168),Ucb(169),Ucb(170),Ucb(171),
- Ucb(172),Ucb(173),Ucb(174),Ucb(175),Ucb(176),Ucb(177),Ucb(178),Ucb(179),
- Ucb(180),Ucb(181),Ucb(182),Ucb(183),Ucb(184),Ucb(185),Ucb(186),Ucb(187),
- Ucb(188),Ucb(189),Ucb(190),Ucb(191),Ucb(192),Ucb(193),Ucb(194),Ucb(195),
- Ucb(196),Ucb(197),Ucb(198),Ucb(199),Ucb(200),Ucb(201),Ucb(202),Ucb(203),
- Ucb(204),Ucb(205),Ucb(206),Ucb(207),Ucb(208),Ucb(209),Ucb(210),Ucb(211),
- Ucb(212),Ucb(213),Ucb(214),Ucb(215),Ucb(216),Ucb(217),Ucb(218),Ucb(219),
- Ucb(220),Ucb(221),Ucb(222),Ucb(223),Ucb(224),Ucb(225),Ucb(226),Ucb(227),
- Ucb(228),Ucb(229),Ucb(230),Ucb(231),Ucb(232),Ucb(233),Ucb(234),Ucb(235),
- Ucb(236),Ucb(237),Ucb(238),Ucb(239),Ucb(240),Ucb(241),Ucb(242),Ucb(243),
- Ucb(244),Ucb(245),Ucb(246),Ucb(247),Ucb(248),Ucb(249),Ucb(250),Ucb(251),
- Ucb(252),Ucb(253),Ucb(254),Ucb(255)};
-
- static const int mapVcr[256] = {
- Vcr(0),Vcr(1),Vcr(2),Vcr(3),Vcr(4),Vcr(5),Vcr(6),Vcr(7),Vcr(8),Vcr(9),
- Vcr(10),Vcr(11),Vcr(12),Vcr(13),Vcr(14),Vcr(15),Vcr(16),Vcr(17),Vcr(18),
- Vcr(19),Vcr(20),Vcr(21),Vcr(22),Vcr(23),Vcr(24),Vcr(25),Vcr(26),Vcr(27),
- Vcr(28),Vcr(29),Vcr(30),Vcr(31),Vcr(32),Vcr(33),Vcr(34),Vcr(35),Vcr(36),
- Vcr(37),Vcr(38),Vcr(39),Vcr(40),Vcr(41),Vcr(42),Vcr(43),Vcr(44),Vcr(45),
- Vcr(46),Vcr(47),Vcr(48),Vcr(49),Vcr(50),Vcr(51),Vcr(52),Vcr(53),Vcr(54),
- Vcr(55),Vcr(56),Vcr(57),Vcr(58),Vcr(59),Vcr(60),Vcr(61),Vcr(62),Vcr(63),
- Vcr(64),Vcr(65),Vcr(66),Vcr(67),Vcr(68),Vcr(69),Vcr(70),Vcr(71),Vcr(72),
- Vcr(73),Vcr(74),Vcr(75),Vcr(76),Vcr(77),Vcr(78),Vcr(79),Vcr(80),Vcr(81),
- Vcr(82),Vcr(83),Vcr(84),Vcr(85),Vcr(86),Vcr(87),Vcr(88),Vcr(89),Vcr(90),
- Vcr(91),Vcr(92),Vcr(93),Vcr(94),Vcr(95),Vcr(96),Vcr(97),Vcr(98),Vcr(99),
- Vcr(100),Vcr(101),Vcr(102),Vcr(103),Vcr(104),Vcr(105),Vcr(106),Vcr(107),
- Vcr(108),Vcr(109),Vcr(110),Vcr(111),Vcr(112),Vcr(113),Vcr(114),Vcr(115),
- Vcr(116),Vcr(117),Vcr(118),Vcr(119),Vcr(120),Vcr(121),Vcr(122),Vcr(123),
- Vcr(124),Vcr(125),Vcr(126),Vcr(127),Vcr(128),Vcr(129),Vcr(130),Vcr(131),
- Vcr(132),Vcr(133),Vcr(134),Vcr(135),Vcr(136),Vcr(137),Vcr(138),Vcr(139),
- Vcr(140),Vcr(141),Vcr(142),Vcr(143),Vcr(144),Vcr(145),Vcr(146),Vcr(147),
- Vcr(148),Vcr(149),Vcr(150),Vcr(151),Vcr(152),Vcr(153),Vcr(154),Vcr(155),
- Vcr(156),Vcr(157),Vcr(158),Vcr(159),Vcr(160),Vcr(161),Vcr(162),Vcr(163),
- Vcr(164),Vcr(165),Vcr(166),Vcr(167),Vcr(168),Vcr(169),Vcr(170),Vcr(171),
- Vcr(172),Vcr(173),Vcr(174),Vcr(175),Vcr(176),Vcr(177),Vcr(178),Vcr(179),
- Vcr(180),Vcr(181),Vcr(182),Vcr(183),Vcr(184),Vcr(185),Vcr(186),Vcr(187),
- Vcr(188),Vcr(189),Vcr(190),Vcr(191),Vcr(192),Vcr(193),Vcr(194),Vcr(195),
- Vcr(196),Vcr(197),Vcr(198),Vcr(199),Vcr(200),Vcr(201),Vcr(202),Vcr(203),
- Vcr(204),Vcr(205),Vcr(206),Vcr(207),Vcr(208),Vcr(209),Vcr(210),Vcr(211),
- Vcr(212),Vcr(213),Vcr(214),Vcr(215),Vcr(216),Vcr(217),Vcr(218),Vcr(219),
- Vcr(220),Vcr(221),Vcr(222),Vcr(223),Vcr(224),Vcr(225),Vcr(226),Vcr(227),
- Vcr(228),Vcr(229),Vcr(230),Vcr(231),Vcr(232),Vcr(233),Vcr(234),Vcr(235),
- Vcr(236),Vcr(237),Vcr(238),Vcr(239),Vcr(240),Vcr(241),Vcr(242),Vcr(243),
- Vcr(244),Vcr(245),Vcr(246),Vcr(247),Vcr(248),Vcr(249),Vcr(250),Vcr(251),
- Vcr(252),Vcr(253),Vcr(254),Vcr(255)};
-
-
- static const int mapVcg[256] = {
- Vcg(0),Vcg(1),Vcg(2),Vcg(3),Vcg(4),Vcg(5),Vcg(6),Vcg(7),Vcg(8),Vcg(9),
- Vcg(10),Vcg(11),Vcg(12),Vcg(13),Vcg(14),Vcg(15),Vcg(16),Vcg(17),Vcg(18),
- Vcg(19),Vcg(20),Vcg(21),Vcg(22),Vcg(23),Vcg(24),Vcg(25),Vcg(26),Vcg(27),
- Vcg(28),Vcg(29),Vcg(30),Vcg(31),Vcg(32),Vcg(33),Vcg(34),Vcg(35),Vcg(36),
- Vcg(37),Vcg(38),Vcg(39),Vcg(40),Vcg(41),Vcg(42),Vcg(43),Vcg(44),Vcg(45),
- Vcg(46),Vcg(47),Vcg(48),Vcg(49),Vcg(50),Vcg(51),Vcg(52),Vcg(53),Vcg(54),
- Vcg(55),Vcg(56),Vcg(57),Vcg(58),Vcg(59),Vcg(60),Vcg(61),Vcg(62),Vcg(63),
- Vcg(64),Vcg(65),Vcg(66),Vcg(67),Vcg(68),Vcg(69),Vcg(70),Vcg(71),Vcg(72),
- Vcg(73),Vcg(74),Vcg(75),Vcg(76),Vcg(77),Vcg(78),Vcg(79),Vcg(80),Vcg(81),
- Vcg(82),Vcg(83),Vcg(84),Vcg(85),Vcg(86),Vcg(87),Vcg(88),Vcg(89),Vcg(90),
- Vcg(91),Vcg(92),Vcg(93),Vcg(94),Vcg(95),Vcg(96),Vcg(97),Vcg(98),Vcg(99),
- Vcg(100),Vcg(101),Vcg(102),Vcg(103),Vcg(104),Vcg(105),Vcg(106),Vcg(107),
- Vcg(108),Vcg(109),Vcg(110),Vcg(111),Vcg(112),Vcg(113),Vcg(114),Vcg(115),
- Vcg(116),Vcg(117),Vcg(118),Vcg(119),Vcg(120),Vcg(121),Vcg(122),Vcg(123),
- Vcg(124),Vcg(125),Vcg(126),Vcg(127),Vcg(128),Vcg(129),Vcg(130),Vcg(131),
- Vcg(132),Vcg(133),Vcg(134),Vcg(135),Vcg(136),Vcg(137),Vcg(138),Vcg(139),
- Vcg(140),Vcg(141),Vcg(142),Vcg(143),Vcg(144),Vcg(145),Vcg(146),Vcg(147),
- Vcg(148),Vcg(149),Vcg(150),Vcg(151),Vcg(152),Vcg(153),Vcg(154),Vcg(155),
- Vcg(156),Vcg(157),Vcg(158),Vcg(159),Vcg(160),Vcg(161),Vcg(162),Vcg(163),
- Vcg(164),Vcg(165),Vcg(166),Vcg(167),Vcg(168),Vcg(169),Vcg(170),Vcg(171),
- Vcg(172),Vcg(173),Vcg(174),Vcg(175),Vcg(176),Vcg(177),Vcg(178),Vcg(179),
- Vcg(180),Vcg(181),Vcg(182),Vcg(183),Vcg(184),Vcg(185),Vcg(186),Vcg(187),
- Vcg(188),Vcg(189),Vcg(190),Vcg(191),Vcg(192),Vcg(193),Vcg(194),Vcg(195),
- Vcg(196),Vcg(197),Vcg(198),Vcg(199),Vcg(200),Vcg(201),Vcg(202),Vcg(203),
- Vcg(204),Vcg(205),Vcg(206),Vcg(207),Vcg(208),Vcg(209),Vcg(210),Vcg(211),
- Vcg(212),Vcg(213),Vcg(214),Vcg(215),Vcg(216),Vcg(217),Vcg(218),Vcg(219),
- Vcg(220),Vcg(221),Vcg(222),Vcg(223),Vcg(224),Vcg(225),Vcg(226),Vcg(227),
- Vcg(228),Vcg(229),Vcg(230),Vcg(231),Vcg(232),Vcg(233),Vcg(234),Vcg(235),
- Vcg(236),Vcg(237),Vcg(238),Vcg(239),Vcg(240),Vcg(241),Vcg(242),Vcg(243),
- Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251),
- Vcg(252),Vcg(253),Vcg(254),Vcg(255)};
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif
-
diff --git a/files/source/convert.cc b/files/source/convert.cc
index 0882c92..e332bc5 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -12,13 +12,9 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
-#include "libyuv/video_common.h"
+#include "libyuv/scale.h" // For ScalePlane()
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -26,7 +22,43 @@
extern "C" {
#endif
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int src_uv_width, int src_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 ||
+ src_uv_width == 0 || src_uv_height == 0) {
+ return -1;
+ }
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
// Copy I420 with optional flipping
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// is does row coalescing.
LIBYUV_API
int I420Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -35,6 +67,8 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -43,7 +77,7 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
- int halfheight = (height + 1) >> 1;
+ halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
@@ -52,76 +86,17 @@
src_stride_v = -src_stride_v;
}
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+ // Copy UV planes.
CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
}
-// Move to row_win etc.
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_HALFROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // src_uv_stride
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- sub edi, eax
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- pavgb xmm0, [eax + edx]
- sub ecx, 16
- movdqa [eax + edi], xmm0
- lea eax, [eax + 16]
- jg convertloop
- pop edi
- ret
- }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_HALFROW_SSE2
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- "sub %0,%1 \n"
- ".p2align 4 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pavgb (%0,%3),%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%0,%1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_uv), // %1
- "+r"(pix) // %2
- : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0"
-#endif
-);
-}
-#endif
-
-static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- for (int x = 0; x < pix; ++x) {
- dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
- }
-}
-
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
int I422ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -130,78 +105,19 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int halfwidth = (width + 1) >> 1;
- void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) = HalfRow_C;
-#if defined(HAS_HALFROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(halfwidth, 16) &&
- IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- HalfRow = HalfRow_SSE2;
- }
-#endif
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- // SubSample U plane.
- int y;
- for (y = 0; y < height - 1; y += 2) {
- HalfRow(src_u, src_stride_u, dst_u, halfwidth);
- src_u += src_stride_u * 2;
- dst_u += dst_stride_u;
- }
- if (height & 1) {
- HalfRow(src_u, 0, dst_u, halfwidth);
- }
-
- // SubSample V plane.
- for (y = 0; y < height - 1; y += 2) {
- HalfRow(src_v, src_stride_v, dst_v, halfwidth);
- src_v += src_stride_v * 2;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- HalfRow(src_v, 0, dst_v, halfwidth);
- }
- return 0;
+ const int src_uv_width = SUBSAMPLE(width, 1, 1);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
}
-// Blends 32x2 pixels to 16x1
-// source in scale.cc
-#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-#elif !defined(YUV_DISABLE_ASM) && \
- (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-#endif
-void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
int I444ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -210,74 +126,16 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int halfwidth = (width + 1) >> 1;
- void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C;
-#if defined(HAS_SCALEROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(halfwidth, 16)) {
- ScaleRowDown2 = ScaleRowDown2Int_NEON;
- }
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(halfwidth, 16) &&
- IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- ScaleRowDown2 = ScaleRowDown2Int_SSE2;
- }
-#endif
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- // SubSample U plane.
- int y;
- for (y = 0; y < height - 1; y += 2) {
- ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth);
- src_u += src_stride_u * 2;
- dst_u += dst_stride_u;
- }
- if (height & 1) {
- ScaleRowDown2(src_u, 0, dst_u, halfwidth);
- }
-
- // SubSample V plane.
- for (y = 0; y < height - 1; y += 2) {
- ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth);
- src_v += src_stride_v * 2;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ScaleRowDown2(src_v, 0, dst_v, halfwidth);
- }
- return 0;
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ width, height);
}
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr);
-
// 411 chroma is 1/4 width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
@@ -288,45 +146,15 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (height - 1) * dst_stride_u;
- dst_v = dst_v + (height - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- int quarterwidth = (width + 3) >> 2;
-
- // Resample U plane.
- ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
- halfwidth, halfheight, // to 1/2 width, 1/2 height
- src_stride_u,
- dst_stride_u,
- src_u, dst_u);
-
- // Resample V plane.
- ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
- halfwidth, halfheight, // to 1/2 width, 1/2 height
- src_stride_v,
- dst_stride_v,
- src_v, dst_v);
- return 0;
+ const int src_uv_width = SUBSAMPLE(width, 3, 2);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
}
// I400 is greyscale typically used in MJPG
@@ -336,6 +164,8 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
if (!src_y || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1;
@@ -343,11 +173,10 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
+ halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
@@ -355,33 +184,42 @@
}
static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
- uint8* dst, int dst_stride_frame,
+ uint8* dst, int dst_stride,
int width, int height) {
+ int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
- CopyRow = CopyRow_NEON;
- }
-#elif defined(HAS_COPYROW_X86)
- if (IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) &&
- IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
- CopyRow = CopyRow_SSE2;
- }
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
}
#endif
// Copy plane
- for (int y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride_frame, width);
+ CopyRow(src + src_stride_0, dst + dst_stride, width);
src += src_stride_0 + src_stride_1;
- dst += dst_stride_frame * 2;
+ dst += dst_stride * 2;
}
if (height & 1) {
CopyRow(src, dst, width);
@@ -404,6 +242,11 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) = SplitUVRow_C;
if (!src_y || !src_uv ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -412,7 +255,7 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
- int halfheight = (height + 1) >> 1;
+ halfheight = (height + 1) >> 1;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_u = dst_u + (halfheight - 1) * dst_stride_u;
dst_v = dst_v + (halfheight - 1) * dst_stride_v;
@@ -420,33 +263,70 @@
dst_stride_u = -dst_stride_u;
dst_stride_v = -dst_stride_v;
}
-
- int halfwidth = (width + 1) >> 1;
- void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
- SplitUV_C;
-#if defined(HAS_SPLITUV_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
- SplitUV = SplitUV_NEON;
+ // Coalesce rows.
+ if (src_stride_y0 == width &&
+ src_stride_y1 == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
}
-#elif defined(HAS_SPLITUV_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(halfwidth, 16) &&
- IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- SplitUV = SplitUV_SSE2;
+ // Coalesce rows.
+ if (src_stride_uv == halfwidth * 2 &&
+ dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+ IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+ IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+ SplitUVRow = SplitUVRow_Any_DSPR2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_DSPR2;
+ }
}
#endif
if (dst_y) {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
+ if (src_stride_y0 == src_stride_y1) {
+ CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+ } else {
+ CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+ width, height);
+ }
}
- int halfheight = (height + 1) >> 1;
- for (int y = 0; y < halfheight; ++y) {
+ for (y = 0; y < halfheight; ++y) {
// Copy a row of UV.
- SplitUV(src_uv, dst_u, dst_v, halfwidth);
+ SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
dst_u += dst_stride_u;
dst_v += dst_stride_v;
src_uv += src_stride_uv;
@@ -470,6 +350,22 @@
width, height);
}
+// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_vu, src_stride_vu,
+ dst_y, dst_stride_y,
+ dst_v, dst_stride_v,
+ dst_u, dst_stride_u,
+ width, height);
+}
+
// Convert M420 to I420.
LIBYUV_API
int M420ToI420(const uint8* src_m420, int src_stride_m420,
@@ -485,159 +381,6 @@
width, height);
}
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (!src_y || !src_yuy2 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-
- void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) = YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
- YUY2ToYRow_C;
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- }
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
- }
- }
-#elif defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width > 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
- }
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUV422Row = YUY2ToUV422Row_NEON;
- }
- }
-#endif
-
- for (int y = 0; y < height - 1; y += 2) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
-
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- CopyRow(src_y, dst_y, width);
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- }
- return 0;
-}
-
-// Test if over reading on source is safe.
-// TODO(fbarchard): Find more efficient solution to safely do odd sizes.
-// Macros to control read policy, from slowest to fastest:
-// READSAFE_NEVER - disables read ahead on systems with strict memory reads
-// READSAFE_ODDHEIGHT - last row of odd height done with C.
-// This policy assumes that the caller handles the last row of an odd height
-// image using C.
-// READSAFE_PAGE - enable read ahead within same page.
-// A page is 4096 bytes. When reading ahead, if the last pixel is near the
-// end the page, and a read spans the page into the next page, a memory
-// exception can occur if that page has not been allocated, or is a guard
-// page. This setting ensures the overread is within the same page.
-// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions
-// or where buffers are padded by 64 bytes.
-
-#if defined(HAS_RGB24TOARGBROW_SSSE3) || \
- defined(HAS_RGB24TOARGBROW_SSSE3) || \
- defined(HAS_RAWTOARGBROW_SSSE3) || \
- defined(HAS_RGB565TOARGBROW_SSE2) || \
- defined(HAS_ARGB1555TOARGBROW_SSE2) || \
- defined(HAS_ARGB4444TOARGBROW_SSE2)
-
-#define READSAFE_ODDHEIGHT
-
-static bool TestReadSafe(const uint8* src_yuy2, int src_stride_yuy2,
- int width, int height, int bpp, int overread) {
- if (width > kMaxStride) {
- return false;
- }
-#if defined(READSAFE_ALWAYS)
- return true;
-#elif defined(READSAFE_NEVER)
- return false;
-#elif defined(READSAFE_ODDHEIGHT)
- if (!(width & 15) ||
- (src_stride_yuy2 >= 0 && (height & 1) && width * bpp >= overread)) {
- return true;
- }
- return false;
-#elif defined(READSAFE_PAGE)
- if (src_stride_yuy2 >= 0) {
- src_yuy2 += (height - 1) * src_stride_yuy2;
- }
- uintptr_t last_adr = (uintptr_t)(src_yuy2) + width * bpp - 1;
- uintptr_t last_read_adr = last_adr + overread - 1;
- if (((last_adr ^ last_read_adr) & ~4095) == 0) {
- return true;
- }
- return false;
-#endif
-}
-#endif
-
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
@@ -645,43 +388,41 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int width) = YUY2ToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
- void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*YUY2ToYRow)(const uint8* src_yuy2,
- uint8* dst_y, int pix);
- YUY2ToYRow = YUY2ToYRow_C;
- YUY2ToUVRow = YUY2ToUVRow_C;
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- }
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
-#elif defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width > 16) {
- YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
- }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUVRow = YUY2ToUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
}
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUVRow = YUY2ToUVRow_NEON;
@@ -689,7 +430,7 @@
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
@@ -712,43 +453,41 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int width) = UYVYToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
- void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int pix);
- UYVYToYRow = UYVYToYRow_C;
- UYVYToUVRow = UYVYToUVRow_C;
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- UYVYToUVRow = UYVYToUVRow_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- }
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
-#elif defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width > 16) {
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUVRow = UYVYToUVRow_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
}
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUVRow = UYVYToUVRow_NEON;
@@ -756,7 +495,7 @@
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
@@ -772,162 +511,18 @@
return 0;
}
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
- defined(__i386__) || defined(_M_IX86) || \
- defined(__arm__) || defined(_M_ARM) || \
- (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define READWORD(p) (*reinterpret_cast<const uint32*>(p))
-#else
-static inline uint32 READWORD(const uint8* p) {
- return static_cast<uint32>(p[0]) |
- (static_cast<uint32>(p[1]) << 8) |
- (static_cast<uint32>(p[2]) << 16) |
- (static_cast<uint32>(p[3]) << 24);
-}
-#endif
-
-// Must be multiple of 6 pixels. Will over convert to handle remainder.
-// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
-static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
- for (int x = 0; x < width; x += 6) {
- uint32 w = READWORD(src_v210 + 0);
- dst_uyvy[0] = (w >> 2) & 0xff;
- dst_uyvy[1] = (w >> 12) & 0xff;
- dst_uyvy[2] = (w >> 22) & 0xff;
-
- w = READWORD(src_v210 + 4);
- dst_uyvy[3] = (w >> 2) & 0xff;
- dst_uyvy[4] = (w >> 12) & 0xff;
- dst_uyvy[5] = (w >> 22) & 0xff;
-
- w = READWORD(src_v210 + 8);
- dst_uyvy[6] = (w >> 2) & 0xff;
- dst_uyvy[7] = (w >> 12) & 0xff;
- dst_uyvy[8] = (w >> 22) & 0xff;
-
- w = READWORD(src_v210 + 12);
- dst_uyvy[9] = (w >> 2) & 0xff;
- dst_uyvy[10] = (w >> 12) & 0xff;
- dst_uyvy[11] = (w >> 22) & 0xff;
-
- src_v210 += 16;
- dst_uyvy += 12;
- }
-}
-
-// Convert V210 to I420.
-// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels.
-// With is multiple of 48.
-LIBYUV_API
-int V210ToI420(const uint8* src_v210, int src_stride_v210,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (width * 2 * 2 > kMaxStride) { // 2 rows of UYVY are required.
- return -1;
- } else if (!src_v210 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_v210 = src_v210 + (height - 1) * src_stride_v210;
- src_stride_v210 = -src_stride_v210;
- }
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- void (*V210ToUYVYRow)(const uint8* src_v210, uint8* dst_uyvy, int pix);
- V210ToUYVYRow = V210ToUYVYRow_C;
-
- void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int pix);
- UYVYToYRow = UYVYToYRow_C;
- UYVYToUVRow = UYVYToUVRow_C;
-#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
-#elif defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width > 16) {
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- }
- }
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_NEON;
- UYVYToUVRow = UYVYToUVRow_NEON;
- }
- }
-#endif
-
-#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- UYVYToUVRow = UYVYToUVRow_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- }
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- UYVYToUVRow = UYVYToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
- }
-#elif defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width > 16) {
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- }
- }
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_NEON;
- UYVYToUVRow = UYVYToUVRow_NEON;
- }
- }
-#endif
-
- for (int y = 0; y < height - 1; y += 2) {
- V210ToUYVYRow(src_v210, row, width);
- V210ToUYVYRow(src_v210 + src_stride_v210, row + kMaxStride, width);
- UYVYToUVRow(row, kMaxStride, dst_u, dst_v, width);
- UYVYToYRow(row, dst_y, width);
- UYVYToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- src_v210 += src_stride_v210 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- V210ToUYVYRow(src_v210, row, width);
- UYVYToUVRow(row, 0, dst_u, dst_v, width);
- UYVYToYRow(row, dst_y, width);
- }
- return 0;
-}
-
+// Convert ARGB to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
if (!src_argb ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -939,32 +534,44 @@
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
- ARGBToYRow = ARGBToYRow_C;
- ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- }
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
@@ -980,12 +587,18 @@
return 0;
}
+// Convert BGRA to I420.
LIBYUV_API
int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+ BGRAToYRow_C;
if (!src_bgra ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -997,32 +610,34 @@
src_bgra = src_bgra + (height - 1) * src_stride_bgra;
src_stride_bgra = -src_stride_bgra;
}
- void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix);
- void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-
- BGRAToYRow = BGRAToYRow_C;
- BGRAToUVRow = BGRAToUVRow_C;
-#if defined(HAS_BGRATOYROW_SSSE3)
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
- BGRAToYRow = BGRAToYRow_Any_SSSE3;
- }
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
- BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
- BGRAToUVRow = BGRAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- BGRAToYRow = BGRAToYRow_SSSE3;
- }
- }
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ BGRAToYRow = BGRAToYRow_SSSE3;
}
}
#endif
+#if defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToYRow = BGRAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
+ }
+ }
+#endif
- for (int y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
BGRAToYRow(src_bgra, dst_y, width);
BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
@@ -1038,12 +653,18 @@
return 0;
}
+// Convert ABGR to I420.
LIBYUV_API
int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+ ABGRToYRow_C;
if (!src_abgr ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1055,32 +676,34 @@
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
- void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix);
- void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-
- ABGRToYRow = ABGRToYRow_C;
- ABGRToUVRow = ABGRToUVRow_C;
-#if defined(HAS_ABGRTOYROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
- ABGRToYRow = ABGRToYRow_Any_SSSE3;
- }
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
- ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ABGRToYRow = ABGRToYRow_SSSE3;
- }
- }
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
}
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
ABGRToYRow(src_abgr, dst_y, width);
ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
@@ -1096,12 +719,18 @@
return 0;
}
+// Convert RGBA to I420.
LIBYUV_API
int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+ RGBAToYRow_C;
if (!src_rgba ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1113,32 +742,34 @@
src_rgba = src_rgba + (height - 1) * src_stride_rgba;
src_stride_rgba = -src_stride_rgba;
}
- void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix);
- void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
-
- RGBAToYRow = RGBAToYRow_C;
- RGBAToUVRow = RGBAToUVRow_C;
-#if defined(HAS_RGBATOYROW_SSSE3)
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
- RGBAToYRow = RGBAToYRow_Any_SSSE3;
- }
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
- RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- RGBAToYRow = RGBAToYRow_SSSE3;
- }
- }
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYRow = RGBAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
}
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
RGBAToYRow(src_rgba, dst_y, width);
RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
@@ -1154,18 +785,30 @@
return 0;
}
+// Convert RGB24 to I420.
LIBYUV_API
int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (width * 4 > kMaxStride) { // Row buffer is required.
+ int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+ void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+ RGB24ToYRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
- } else if (!src_rgb24 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
}
// Negative height means invert the image.
if (height < 0) {
@@ -1173,70 +816,113 @@
src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
src_stride_rgb24 = -src_stride_rgb24;
}
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
- RGB24ToARGBRow = RGB24ToARGBRow_C;
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- TestReadSafe(src_rgb24, src_stride_rgb24, width, height, 3, 48)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
-#endif
-
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
- ARGBToYRow = ARGBToYRow_C;
- ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- }
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ RGB24ToYRow = RGB24ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
}
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
- for (int y = 0; y < height - 1; y += 2) {
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
- ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYROW_NEON)
+ free_aligned_buffer_64(row);
}
- if (height & 1) {
- RGB24ToARGBRow_C(src_rgb24, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
+#endif
return 0;
}
+// Convert RAW to I420.
LIBYUV_API
int RAWToI420(const uint8* src_raw, int src_stride_raw,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (width * 4 > kMaxStride) { // Row buffer is required.
+ int y;
+#if defined(HAS_RAWTOYROW_NEON)
+ void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+ RAWToYRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
- } else if (!src_raw ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
}
// Negative height means invert the image.
if (height < 0) {
@@ -1244,69 +930,112 @@
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
- RAWToARGBRow = RAWToARGBRow_C;
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- TestReadSafe(src_raw, src_stride_raw, width, height, 3, 48)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
-#endif
-
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
- ARGBToYRow = ARGBToYRow_C;
- ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- }
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ RAWToYRow = RAWToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_NEON;
}
}
}
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
- for (int y = 0; y < height - 1; y += 2) {
- RAWToARGBRow(src_raw, row, width);
- RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
- ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- src_raw += src_stride_raw * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYROW_NEON)
+ free_aligned_buffer_64(row);
}
- if (height & 1) {
- RAWToARGBRow_C(src_raw, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
+#endif
return 0;
}
+// Convert RGB565 to I420.
LIBYUV_API
int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (width * 4 > kMaxStride) { // Row buffer is required.
- return -1;
- } else if (!src_rgb565 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+ void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+ RGB565ToYRow_C;
+#else
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1315,70 +1044,121 @@
src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
src_stride_rgb565 = -src_stride_rgb565;
}
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
- RGB565ToARGBRow = RGB565ToARGBRow_C;
-#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- TestReadSafe(src_rgb565, src_stride_rgb565, width, height, 2, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
-#endif
-
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
- ARGBToYRow = ARGBToYRow_C;
- ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- }
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
}
}
}
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
- for (int y = 0; y < height - 1; y += 2) {
- RGB565ToARGBRow(src_rgb565, row, width);
- RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
- ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- src_rgb565 += src_stride_rgb565 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB565TOYROW_NEON)
+ free_aligned_buffer_64(row);
}
- if (height & 1) {
- RGB565ToARGBRow_C(src_rgb565, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
+#endif
return 0;
}
+// Convert ARGB1555 to I420.
LIBYUV_API
int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (width * 4 > kMaxStride) { // Row buffer is required.
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
+ ARGB1555ToYRow_C;
+#else
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
- } else if (!src_argb1555 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
}
// Negative height means invert the image.
if (height < 0) {
@@ -1386,71 +1166,123 @@
src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
src_stride_argb1555 = -src_stride_argb1555;
}
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_C;
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- TestReadSafe(src_argb1555, src_stride_argb1555, width, height, 2, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
-#endif
-
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
- ARGBToYRow = ARGBToYRow_C;
- ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- }
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
}
}
}
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
- for (int y = 0; y < height - 1; y += 2) {
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555,
- row + kMaxStride, width);
- ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- src_argb1555 += src_stride_argb1555 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+ free_aligned_buffer_64(row);
}
- if (height & 1) {
- ARGB1555ToARGBRow_C(src_argb1555, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
+#endif
return 0;
}
+// Convert ARGB4444 to I420.
LIBYUV_API
int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (width * 4 > kMaxStride) { // Row buffer is required.
+ int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
+ ARGB4444ToYRow_C;
+#else
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
- } else if (!src_argb4444 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
}
// Negative height means invert the image.
if (height < 0) {
@@ -1458,621 +1290,97 @@
src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
src_stride_argb4444 = -src_stride_argb4444;
}
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_C;
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- TestReadSafe(src_argb4444, src_stride_argb4444, width, height, 2, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
}
#endif
-
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
- ARGBToYRow = ARGBToYRow_C;
- ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
}
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-
- for (int y = 0; y < height - 1; y += 2) {
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444,
- row + kMaxStride, width);
- ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- src_argb4444 += src_stride_argb4444 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ARGB4444ToARGBRow_C(src_argb4444, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
- return 0;
-}
-
-#ifdef HAVE_JPEG
-struct I420Buffers {
- uint8* y;
- int y_stride;
- uint8* u;
- int u_stride;
- uint8* v;
- int v_stride;
- int w;
- int h;
-};
-
-static void JpegCopyI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = static_cast<I420Buffers*>(opaque);
- I420Copy(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI422ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = static_cast<I420Buffers*>(opaque);
- I422ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI444ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = static_cast<I420Buffers*>(opaque);
- I444ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = static_cast<I420Buffers*>(opaque);
- I411ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI400ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = static_cast<I420Buffers*>(opaque);
- I400ToI420(data[0], strides[0],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-// MJPG (Motion JPeg) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToI420(const uint8* sample,
- size_t sample_size,
- uint8* y, int y_stride,
- uint8* u, int u_stride,
- uint8* v, int v_stride,
- int w, int h,
- int dw, int dh) {
- if (sample_size == kUnknownDataSize) {
- // ERROR: MJPEG frame size unknown
- return -1;
- }
-
- // TODO(fbarchard): Port to C
- MJpegDecoder mjpeg_decoder;
- bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret && (mjpeg_decoder.GetWidth() != w ||
- mjpeg_decoder.GetHeight() != h)) {
- // ERROR: MJPEG frame has unexpected dimensions
- mjpeg_decoder.UnloadFrame();
- return 1; // runtime failure
- }
- if (ret) {
- I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
- // YUV420
- if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 2 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
- // YUV422
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
- // YUV444
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
- // YUV411
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
- // YUV400
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceGrayscale &&
- mjpeg_decoder.GetNumComponents() == 1 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
- } else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice. 411 is supported by libjpeg
- // ERROR: Unable to convert MJPEG frame because format is not supported
- mjpeg_decoder.UnloadFrame();
- return 1;
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
}
}
- return 0;
-}
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-// With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToI420(const uint8* sample,
-#ifdef HAVE_JPEG
- size_t sample_size,
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
#else
- size_t /* sample_size */,
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- uint8* y, int y_stride,
- uint8* u, int u_stride,
- uint8* v, int v_stride,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int dst_width, int dst_height,
- RotationMode rotation,
- uint32 format) {
- if (!y || !u || !v || !sample ||
- src_width <= 0 || dst_width <= 0 ||
- src_height == 0 || dst_height == 0) {
- return -1;
- }
- int aligned_src_width = (src_width + 1) & ~1;
- const uint8* src;
- const uint8* src_uv;
- int abs_src_height = (src_height < 0) ? -src_height : src_height;
- int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
- if (src_height < 0) {
- inv_dst_height = -inv_dst_height;
- }
- int r = 0;
-
- // One pass rotation is available for some formats. For the rest, convert
- // to I420 (with optional vertical flipping) into a temporary I420 buffer,
- // and then rotate the I420 to the final destination buffer.
- // For in-place conversion, if destination y is same as source sample,
- // also enable temporary buffer.
- bool need_buf = (rotation && format != FOURCC_I420 &&
- format != FOURCC_NV12 && format != FOURCC_NV21 &&
- format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
- uint8* tmp_y = y;
- uint8* tmp_u = u;
- uint8* tmp_v = v;
- int tmp_y_stride = y_stride;
- int tmp_u_stride = u_stride;
- int tmp_v_stride = v_stride;
- uint8* buf = NULL;
- int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
- if (need_buf) {
- int y_size = dst_width * abs_dst_height;
- int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2);
- buf = new uint8[y_size + uv_size * 2];
- if (!buf) {
- return 1; // Out of memory runtime error.
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
}
- y = buf;
- u = y + y_size;
- v = u + uv_size;
- y_stride = dst_width;
- u_stride = v_stride = ((dst_width + 1) / 2);
- }
-
- switch (format) {
- // Single plane formats
- case FOURCC_YUY2:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToI420(src, aligned_src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_UYVY:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToI420(src, aligned_src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_V210:
- // stride is multiple of 48 pixels (128 bytes).
- // pixels come in groups of 6 = 16 bytes
- src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
- crop_x / 6 * 16;
- r = V210ToI420(src, (aligned_src_width + 47) / 48 * 128,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_24BG:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToI420(src, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RAW:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToI420(src, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_ARGB:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_BGRA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_ABGR:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RGBA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RGBP:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RGBO:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_R444:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_I400:
- src = sample + src_width * crop_y + crop_x;
- r = I400ToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
-
- // Biplanar formats
- case FOURCC_NV12:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- r = NV12ToI420Rotate(src, src_width,
- src_uv, aligned_src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height, rotation);
- break;
- case FOURCC_NV21:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- // Call NV12 but with u and v parameters swapped.
- r = NV12ToI420Rotate(src, src_width,
- src_uv, aligned_src_width,
- y, y_stride,
- v, v_stride,
- u, u_stride,
- dst_width, inv_dst_height, rotation);
- break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_Q420:
- src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
- src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
- src_width + crop_x * 2;
- r = Q420ToI420(src, src_width * 3,
- src_uv, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- // Triplanar formats
- case FOURCC_I420:
- case FOURCC_YU12:
- case FOURCC_YV12: {
- const uint8* src_y = sample + (src_width * crop_y + crop_x);
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- int halfheight = (abs_src_height + 1) / 2;
- if (format == FOURCC_YV12) {
- src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- }
- r = I420Rotate(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height, rotation);
- break;
- }
- case FOURCC_I422:
- case FOURCC_YV16: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- if (format == FOURCC_YV16) {
- src_v = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- }
- r = I422ToI420(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- }
- case FOURCC_I444:
- case FOURCC_YV24: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- if (format == FOURCC_YV24) {
- src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- } else {
- src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- }
- r = I444ToI420(src_y, src_width,
- src_u, src_width,
- src_v, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- }
- case FOURCC_I411: {
- int quarterwidth = (src_width + 3) / 4;
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u = sample + src_width * abs_src_height +
- quarterwidth * crop_y + crop_x / 4;
- const uint8* src_v = sample + src_width * abs_src_height +
- quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
- r = I411ToI420(src_y, src_width,
- src_u, quarterwidth,
- src_v, quarterwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_width, inv_dst_height);
- break;
- }
-#ifdef HAVE_JPEG
- case FOURCC_MJPG:
- r = MJPGToI420(sample, sample_size,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- src_width, abs_src_height, dst_width, inv_dst_height);
- break;
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- default:
- r = -1; // unknown fourcc - return failure code.
- }
-
- if (need_buf) {
- if (!r) {
- r = I420Rotate(y, y_stride,
- u, u_stride,
- v, v_stride,
- tmp_y, tmp_y_stride,
- tmp_u, tmp_u_stride,
- tmp_v, tmp_v_stride,
- dst_width, abs_dst_height, rotation);
}
- delete buf;
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ free_aligned_buffer_64(row);
}
-
- return r;
+#endif
+ return 0;
}
#ifdef __cplusplus
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
index 1c5aa9d..fb9582d 100644
--- a/files/source/convert_argb.cc
+++ b/files/source/convert_argb.cc
@@ -4,22 +4,20 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/convert_argb.h"
-#include <string.h> // for memset()
-
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
+#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
#include "libyuv/rotate_argb.h"
-#include "libyuv/video_common.h"
#include "libyuv/row.h"
+#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
@@ -47,13 +45,180 @@
return 0;
}
-// Convert I444 to ARGB.
+// Convert I422 to ARGB with matrix
+static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
+int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants,
+ width, height);
+}
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvH709Constants,
+ width, height);
+}
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I422 to ARGB with matrix
+static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
@@ -65,25 +230,51 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*I444ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I444ToARGBRow_C;
-#if defined(HAS_I444TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
- for (int y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -99,6 +290,103 @@
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants,
+ width, height);
+}
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvH709Constants,
+ width, height);
+}
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to ARGB with matrix
+static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width, int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
@@ -110,32 +398,42 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u == width &&
+ src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I444ToARGBRow = I444ToARGBRow_NEON;
}
}
#endif
- for (int y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -144,6 +442,51 @@
return 0;
}
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants,
+ width, height);
+}
+
// Convert I411 to ARGB.
LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -151,6 +494,13 @@
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*I411ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I411ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
@@ -162,25 +512,42 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*I411ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I411ToARGBRow_C;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 4 == width &&
+ src_stride_v * 4 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
#if defined(HAS_I411TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I411ToARGBRow = I411ToARGBRow_SSSE3;
- }
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I411TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I411ToARGBRow = I411ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I411ToARGBRow = I411ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_NEON;
}
}
#endif
- for (int y = 0; y < height; ++y) {
- I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -189,12 +556,152 @@
return 0;
}
+// Convert I420 with Alpha to preattenuated ARGB.
+static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ const uint8* src_a, int src_stride_a,
+ uint8* dst_argb, int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width, int height, int attenuate) {
+ int y;
+ void (*I422AlphaToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 with Alpha to ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ const uint8* src_a, int src_stride_a,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int attenuate) {
+ return I420AlphaToARGBMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ src_a, src_stride_a,
+ dst_argb, dst_stride_argb,
+ &kYuvI601Constants,
+ width, height, attenuate);
+}
+
+// Convert I420 with Alpha to ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ const uint8* src_a, int src_stride_a,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height, int attenuate) {
+ return I420AlphaToARGBMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ src_a, src_stride_a,
+ dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
// Convert I400 to ARGB.
LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I400ToARGBRow)(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) = I400ToARGBRow_C;
if (!src_y || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -205,30 +712,54 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*YToARGBRow)(const uint8* y_buf,
- uint8* rgb_buf,
- int width) = YToARGBRow_C;
-#if defined(HAS_YTOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- YToARGBRow = YToARGBRow_SSE2;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I400ToARGBRow = I400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
- YToARGBRow(src_y, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
return 0;
}
-// Convert I400 to ARGB.
+// Convert J400 to ARGB.
LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
+int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ J400ToARGBRow_C;
if (!src_y || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -239,57 +770,80 @@
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
- void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
- I400ToARGBRow_C;
-#if defined(HAS_I400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_J400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_SSE2;
+ }
}
#endif
-
- for (int y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
+#if defined(HAS_J400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J400ToARGBRow = J400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ J400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y;
dst_argb += dst_stride_argb;
}
return 0;
}
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
// Convert BGRA to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
- if (!src_bgra || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_bgra = src_bgra + (height - 1) * src_stride_bgra;
- src_stride_bgra = -src_stride_bgra;
- }
- void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix) =
- BGRAToARGBRow_C;
-#if defined(HAS_BGRATOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- BGRAToARGBRow = BGRAToARGBRow_SSSE3;
- }
-#endif
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB),
+ width, height);
+}
- for (int y = 0; y < height; ++y) {
- BGRAToARGBRow(src_bgra, dst_argb, width);
- src_bgra += src_stride_bgra;
- dst_argb += dst_stride_argb;
- }
- return 0;
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB),
+ width, height);
}
// Convert ABGR to ARGB.
@@ -297,33 +851,21 @@
int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
- if (!src_abgr || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_abgr = src_abgr + (height - 1) * src_stride_abgr;
- src_stride_abgr = -src_stride_abgr;
- }
- void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix) =
- ABGRToARGBRow_C;
-#if defined(HAS_ABGRTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ABGRToARGBRow = ABGRToARGBRow_SSSE3;
- }
-#endif
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB),
+ width, height);
+}
- for (int y = 0; y < height; ++y) {
- ABGRToARGBRow(src_abgr, dst_argb, width);
- src_abgr += src_stride_abgr;
- dst_argb += dst_stride_argb;
- }
- return 0;
+// Convert ARGB to ABGR to (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB),
+ width, height);
}
// Convert RGBA to ARGB.
@@ -331,66 +873,10 @@
int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
- if (!src_rgba || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgba = src_rgba + (height - 1) * src_stride_rgba;
- src_stride_rgba = -src_stride_rgba;
- }
- void (*RGBAToARGBRow)(const uint8* src_rgba, uint8* dst_argb, int pix) =
- RGBAToARGBRow_C;
-#if defined(HAS_RGBATOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- RGBAToARGBRow = RGBAToARGBRow_SSSE3;
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- RGBAToARGBRow(src_rgba, dst_argb, width);
- src_rgba += src_stride_rgba;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert RAW to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- if (!src_raw || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
- void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix) =
- RAWToARGBRow_C;
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- RAWToARGBRow(src_raw, dst_argb, width);
- src_raw += src_stride_raw;
- dst_argb += dst_stride_argb;
- }
- return 0;
+ return ARGBShuffle(src_rgba, src_stride_rgba,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskRGBAToARGB),
+ width, height);
}
// Convert RGB24 to ARGB.
@@ -398,6 +884,9 @@
int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RGB24ToARGBRow_C;
if (!src_rgb24 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -408,17 +897,31 @@
src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
src_stride_rgb24 = -src_stride_rgb24;
}
- void (*RGB24ToARGBRow)(const uint8* src_rgb24, uint8* dst_argb, int pix) =
- RGB24ToARGBRow_C;
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
+ }
#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
src_rgb24 += src_stride_rgb24;
dst_argb += dst_stride_argb;
@@ -426,11 +929,64 @@
return 0;
}
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ RAWToARGBRow_C;
+ if (!src_raw || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_argb = 0;
+ }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
// Convert RGB565 to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
+ RGB565ToARGBRow_C;
if (!src_rgb565 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -441,17 +997,39 @@
src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
src_stride_rgb565 = -src_stride_rgb565;
}
- void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
- RGB565ToARGBRow_C;
+ // Coalesce rows.
+ if (src_stride_rgb565 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb565 = dst_stride_argb = 0;
+ }
#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
RGB565ToARGBRow(src_rgb565, dst_argb, width);
src_rgb565 += src_stride_rgb565;
dst_argb += dst_stride_argb;
@@ -464,8 +1042,11 @@
int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
if (!src_argb1555 || !dst_argb ||
- width <= 0 || height == 0) {
+ width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -474,17 +1055,39 @@
src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
src_stride_argb1555 = -src_stride_argb1555;
}
- void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
- int pix) = ARGB1555ToARGBRow_C;
+ // Coalesce rows.
+ if (src_stride_argb1555 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb1555 = dst_stride_argb = 0;
+ }
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
src_argb1555 += src_stride_argb1555;
dst_argb += dst_stride_argb;
@@ -497,6 +1100,9 @@
int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
if (!src_argb4444 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -507,17 +1113,39 @@
src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
src_stride_argb4444 = -src_stride_argb4444;
}
- void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
- int pix) = ARGB4444ToARGBRow_C;
+ // Coalesce rows.
+ if (src_stride_argb4444 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb4444 = dst_stride_argb = 0;
+ }
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
src_argb4444 += src_stride_argb4444;
dst_argb += dst_stride_argb;
@@ -531,6 +1159,12 @@
const uint8* src_uv, int src_stride_uv,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = NV12ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -541,23 +1175,24 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*NV12ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV12ToARGBRow_C;
#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -565,8 +1200,8 @@
}
#endif
- for (int y = 0; y < height; ++y) {
- NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
@@ -582,6 +1217,12 @@
const uint8* src_uv, int src_stride_uv,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = NV21ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -592,23 +1233,24 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*NV21ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV21ToARGBRow_C;
#if defined(HAS_NV21TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
- }
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_NV21TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
@@ -616,8 +1258,8 @@
}
#endif
- for (int y = 0; y < height; ++y) {
- NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
@@ -632,6 +1274,12 @@
int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = NV12ToARGBRow_C;
if (!src_m420 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -642,31 +1290,42 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*NV12ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV12ToARGBRow_C;
#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
}
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ for (y = 0; y < height - 1; y += 2) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
+ &kYuvI601Constants, width);
NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, width);
+ dst_argb + dst_stride_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb * 2;
src_m420 += src_stride_m420 * 3;
}
if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
+ &kYuvI601Constants, width);
}
return 0;
}
@@ -676,6 +1335,12 @@
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*YUY2ToARGBRow)(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) =
+ YUY2ToARGBRow_C;
if (!src_yuy2 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -686,72 +1351,39 @@
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
- void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) = YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2,
- uint8* dst_y, int pix) = YUY2ToYRow_C;
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_argb = 0;
+ }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width > 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
}
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_NEON;
- }
+ YUY2ToARGBRow = YUY2ToARGBRow_NEON;
}
}
#endif
-
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-
- SIMD_ALIGNED(uint8 rowy[kMaxStride]);
- SIMD_ALIGNED(uint8 rowu[kMaxStride]);
- SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-
- for (int y = 0; y < height; ++y) {
- YUY2ToUV422Row(src_yuy2, rowu, rowv, width);
- YUY2ToYRow(src_yuy2, rowy, width);
- I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
src_yuy2 += src_stride_yuy2;
dst_argb += dst_stride_argb;
}
@@ -763,6 +1395,12 @@
int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*UYVYToARGBRow)(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) =
+ UYVYToARGBRow_C;
if (!src_uyvy || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -773,527 +1411,45 @@
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
- void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int pix) = UYVYToUV422Row_C;
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int pix) = UYVYToYRow_C;
-#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_argb = 0;
+ }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_SSE2;
- UYVYToYRow = UYVYToYRow_SSE2;
- }
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
}
}
#endif
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToARGBRow = UYVYToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
+ UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_NEON;
}
}
#endif
-
- SIMD_ALIGNED(uint8 rowy[kMaxStride]);
- SIMD_ALIGNED(uint8 rowu[kMaxStride]);
- SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-
- for (int y = 0; y < height; ++y) {
- UYVYToUV422Row(src_uyvy, rowu, rowv, width);
- UYVYToYRow(src_uyvy, rowy, width);
- I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+ for (y = 0; y < height; ++y) {
+ UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
src_uyvy += src_stride_uyvy;
dst_argb += dst_stride_argb;
}
return 0;
}
-#ifdef HAVE_JPEG
-struct ARGBBuffers {
- uint8* argb;
- int argb_stride;
- int w;
- int h;
-};
-
-static void JpegI420ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
- I420ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI422ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
- I422ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI444ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
- I444ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI411ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
- I411ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI400ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
- I400ToARGB(data[0], strides[0],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-// MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample,
- size_t sample_size,
- uint8* argb, int argb_stride,
- int w, int h,
- int dw, int dh) {
- if (sample_size == kUnknownDataSize) {
- // ERROR: MJPEG frame size unknown
- return -1;
- }
-
- // TODO(fbarchard): Port to C
- MJpegDecoder mjpeg_decoder;
- bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret && (mjpeg_decoder.GetWidth() != w ||
- mjpeg_decoder.GetHeight() != h)) {
- // ERROR: MJPEG frame has unexpected dimensions
- mjpeg_decoder.UnloadFrame();
- return 1; // runtime failure
- }
- if (ret) {
- ARGBBuffers bufs = { argb, argb_stride, dw, dh };
- // YUV420
- if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 2 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
- // YUV422
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
- // YUV444
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
- // YUV411
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
- // YUV400
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceGrayscale &&
- mjpeg_decoder.GetNumComponents() == 1 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
- } else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice. 411 is supported by libjpeg
- // ERROR: Unable to convert MJPEG frame because format is not supported
- mjpeg_decoder.UnloadFrame();
- return 1;
- }
- }
- return 0;
-}
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// src_width is used for source stride computation
-// src_height is used to compute location of planes, and indicate inversion
-// sample_size is measured in bytes and is the size of the frame.
-// With MJPEG it is the compressed size of the frame.
-LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
- uint8* dst_argb, int argb_stride,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int dst_width, int dst_height,
- RotationMode rotation,
- uint32 format) {
- if (dst_argb == NULL || sample == NULL ||
- src_width <= 0 || dst_width <= 0 ||
- src_height == 0 || dst_height == 0) {
- return -1;
- }
- int aligned_src_width = (src_width + 1) & ~1;
- const uint8* src;
- const uint8* src_uv;
- int abs_src_height = (src_height < 0) ? -src_height : src_height;
- int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
- if (src_height < 0) {
- inv_dst_height = -inv_dst_height;
- }
- int r = 0;
-
- // One pass rotation is available for some formats. For the rest, convert
- // to I420 (with optional vertical flipping) into a temporary I420 buffer,
- // and then rotate the I420 to the final destination buffer.
- // For in-place conversion, if destination dst_argb is same as source sample,
- // also enable temporary buffer.
- bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample;
- uint8* tmp_argb = dst_argb;
- int tmp_argb_stride = argb_stride;
- uint8* buf = NULL;
- int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
- if (need_buf) {
- int argb_size = dst_width * abs_dst_height * 4;
- buf = new uint8[argb_size];
- if (!buf) {
- return 1; // Out of memory runtime error.
- }
- dst_argb = buf;
- argb_stride = dst_width;
- }
-
- switch (format) {
- // Single plane formats
- case FOURCC_YUY2:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToARGB(src, aligned_src_width * 2,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_UYVY:
- src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToARGB(src, aligned_src_width * 2,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
-// case FOURCC_V210:
- // stride is multiple of 48 pixels (128 bytes).
- // pixels come in groups of 6 = 16 bytes
-// src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
-// crop_x / 6 * 16;
-// r = V210ToARGB(src, (aligned_src_width + 47) / 48 * 128,
-// dst_argb, argb_stride,
-// dst_width, inv_dst_height);
-// break;
- case FOURCC_24BG:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToARGB(src, src_width * 3,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RAW:
- src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToARGB(src, src_width * 3,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_ARGB:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToARGB(src, src_width * 4,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_BGRA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToARGB(src, src_width * 4,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_ABGR:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToARGB(src, src_width * 4,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RGBA:
- src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToARGB(src, src_width * 4,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RGBP:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToARGB(src, src_width * 2,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_RGBO:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToARGB(src, src_width * 2,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_R444:
- src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToARGB(src, src_width * 2,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToARGB(src, src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToARGB(src, src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToARGB(src, src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToARGB(src, src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
-
- case FOURCC_I400:
- src = sample + src_width * crop_y + crop_x;
- r = I400ToARGB(src, src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
-
- // Biplanar formats
- case FOURCC_NV12:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- r = NV12ToARGB(src, src_width,
- src_uv, aligned_src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_NV21:
- src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- // Call NV12 but with u and v parameters swapped.
- r = NV21ToARGB(src, src_width,
- src_uv, aligned_src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
-// case FOURCC_Q420:
-// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-// src_width + crop_x * 2;
-// r = Q420ToARGB(src, src_width * 3,
-// src_uv, src_width * 3,
-// dst_argb, argb_stride,
-// dst_width, inv_dst_height);
-// break;
- // Triplanar formats
- case FOURCC_I420:
- case FOURCC_YU12:
- case FOURCC_YV12: {
- const uint8* src_y = sample + (src_width * crop_y + crop_x);
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- int halfheight = (abs_src_height + 1) / 2;
- if (format == FOURCC_YV12) {
- src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- }
- r = I420ToARGB(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- }
- case FOURCC_I422:
- case FOURCC_YV16: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- int halfwidth = (src_width + 1) / 2;
- if (format == FOURCC_YV16) {
- src_v = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- } else {
- src_u = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
- }
- r = I422ToARGB(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- }
- case FOURCC_I444:
- case FOURCC_YV24: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
- if (format == FOURCC_YV24) {
- src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- } else {
- src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
- src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
- }
- r = I444ToARGB(src_y, src_width,
- src_u, src_width,
- src_v, src_width,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- }
- case FOURCC_I411: {
- int quarterwidth = (src_width + 3) / 4;
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u = sample + src_width * abs_src_height +
- quarterwidth * crop_y + crop_x / 4;
- const uint8* src_v = sample + src_width * abs_src_height +
- quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
- r = I411ToARGB(src_y, src_width,
- src_u, quarterwidth,
- src_v, quarterwidth,
- dst_argb, argb_stride,
- dst_width, inv_dst_height);
- break;
- }
-#ifdef HAVE_JPEG
- case FOURCC_MJPG:
- r = MJPGToARGB(sample, sample_size,
- dst_argb, argb_stride,
- src_width, abs_src_height, dst_width, inv_dst_height);
- break;
-#endif
- default:
- r = -1; // unknown fourcc - return failure code.
- }
-
- if (need_buf) {
- if (!r) {
- r = ARGBRotate(dst_argb, argb_stride,
- tmp_argb, tmp_argb_stride,
- dst_width, abs_dst_height, rotation);
- }
- delete buf;
- }
-
- return r;
-}
-
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
index 4ea974a..46abdeb 100644
--- a/files/source/convert_from.cc
+++ b/files/source/convert_from.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -13,9 +13,9 @@
#include "libyuv/basic_types.h"
#include "libyuv/convert.h" // For I420Copy
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/scale.h" // For ScalePlane()
#include "libyuv/video_common.h"
#include "libyuv/row.h"
@@ -24,6 +24,42 @@
extern "C" {
#endif
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int dst_uv_width, int dst_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 ||
+ dst_uv_width <= 0 || dst_uv_height <= 0) {
+ return -1;
+ }
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
LIBYUV_API
int I420ToI422(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -32,78 +68,20 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (height - 1) * dst_stride_u;
- dst_v = dst_v + (height - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- int halfwidth = (width + 1) >> 1;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
- CopyRow = CopyRow_NEON;
- }
-#elif defined(HAS_COPYROW_X86)
- if (IS_ALIGNED(halfwidth, 4)) {
- CopyRow = CopyRow_X86;
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) &&
- IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
- }
-#endif
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- // UpSample U plane.
- int y;
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src_u, dst_u, halfwidth);
- CopyRow(src_u, dst_u + dst_stride_u, halfwidth);
- src_u += src_stride_u;
- dst_u += dst_stride_u * 2;
- }
- if (height & 1) {
- CopyRow(src_u, dst_u, halfwidth);
- }
-
- // UpSample V plane.
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src_v, dst_v, halfwidth);
- CopyRow(src_v, dst_v + dst_stride_v, halfwidth);
- src_v += src_stride_v;
- dst_v += dst_stride_v * 2;
- }
- if (height & 1) {
- CopyRow(src_v, dst_v, halfwidth);
- }
- return 0;
+ const int dst_uv_width = (Abs(width) + 1) >> 1;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
}
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr);
-
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
LIBYUV_API
int I420ToI444(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -112,44 +90,16 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u|| !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (height - 1) * dst_stride_u;
- dst_v = dst_v + (height - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
-
- // Upsample U plane.
- ScalePlaneBilinear(halfwidth, halfheight,
- width, height,
- src_stride_u,
- dst_stride_u,
- src_u, dst_u);
-
- // Upsample V plane.
- ScalePlaneBilinear(halfwidth, halfheight,
- width, height,
- src_stride_v,
- dst_stride_v,
- src_v, dst_v);
- return 0;
+ const int dst_uv_width = Abs(width);
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
}
// 420 chroma is 1/2 width, 1/2 height
@@ -162,45 +112,16 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (height - 1) * dst_stride_u;
- dst_v = dst_v + (height - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- int quarterwidth = (width + 3) >> 2;
-
- // Resample U plane.
- ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
- quarterwidth, height, // to 1/4 width, 1x height
- src_stride_u,
- dst_stride_u,
- src_u, dst_u);
-
- // Resample V plane.
- ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
- quarterwidth, height, // to 1/4 width, 1x height
- src_stride_v,
- dst_stride_v,
- src_v, dst_v);
- return 0;
+ const int dst_uv_width = (Abs(width) + 3) >> 2;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
}
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
@@ -222,282 +143,58 @@
return 0;
}
-// YUY2 - Macro-pixel = 2 image pixels
-// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
-
-// UYVY - Macro-pixel = 2 image pixels
-// U0Y0V0Y1
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_I42XTOYUY2ROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void I42xToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
-
- align 16
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqa xmm0, [eax] // Y
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2 // YUYV
- punpckhbw xmm1, xmm2
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm1
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#define HAS_I42XTOUYVYROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void I42xToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
-
- align 16
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqa xmm0, [eax] // Y
- movdqa xmm1, xmm2
- lea eax, [eax + 16]
- punpcklbw xmm1, xmm0 // UYVY
- punpckhbw xmm2, xmm0
- movdqa [edi], xmm1
- movdqa [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_I42XTOYUY2ROW_SSE2
-static void I42xToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq (%1,%2,1),%%xmm3 \n"
- "lea 0x8(%1),%1 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%3) \n"
- "movdqa %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_frame), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-#define HAS_I42XTOUYVYROW_SSE2
-static void I42xToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq (%1,%2,1),%%xmm3 \n"
- "lea 0x8(%1),%1 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,(%3) \n"
- "movdqa %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_frame), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-#endif
-
-static void I42xToYUY2Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- for (int x = 0; x < width - 1; x += 2) {
- dst_frame[0] = src_y[0];
- dst_frame[1] = src_u[0];
- dst_frame[2] = src_y[1];
- dst_frame[3] = src_v[0];
- dst_frame += 4;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- }
- if (width & 1) {
- dst_frame[0] = src_y[0];
- dst_frame[1] = src_u[0];
- dst_frame[2] = src_y[0]; // duplicate last y
- dst_frame[3] = src_v[0];
- }
-}
-
-static void I42xToUYVYRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
- for (int x = 0; x < width - 1; x += 2) {
- dst_frame[0] = src_u[0];
- dst_frame[1] = src_y[0];
- dst_frame[2] = src_v[0];
- dst_frame[3] = src_y[1];
- dst_frame += 4;
- src_y += 2;
- src_u += 1;
- src_v += 1;
- }
- if (width & 1) {
- dst_frame[0] = src_u[0];
- dst_frame[1] = src_y[0];
- dst_frame[2] = src_v[0];
- dst_frame[3] = src_y[0]; // duplicate last y
- }
-}
-
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
- defined(__i386__) || defined(_M_IX86) || \
- defined(__arm__) || defined(_M_ARM) || \
- (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
-#else
-static inline void WRITEWORD(uint8* p, uint32 v) {
- p[0] = (uint8)(v & 255);
- p[1] = (uint8)((v >> 8) & 255);
- p[2] = (uint8)((v >> 16) & 255);
- p[3] = (uint8)((v >> 24) & 255);
-}
-#endif
-
-#define EIGHTTOTEN(x) (x << 2 | x >> 6)
-static void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) {
- for (int x = 0; x < width; x += 6) {
- WRITEWORD(dst_v210 + 0, (EIGHTTOTEN(src_uyvy[0])) |
- (EIGHTTOTEN(src_uyvy[1]) << 10) |
- (EIGHTTOTEN(src_uyvy[2]) << 20));
- WRITEWORD(dst_v210 + 4, (EIGHTTOTEN(src_uyvy[3])) |
- (EIGHTTOTEN(src_uyvy[4]) << 10) |
- (EIGHTTOTEN(src_uyvy[5]) << 20));
- WRITEWORD(dst_v210 + 8, (EIGHTTOTEN(src_uyvy[6])) |
- (EIGHTTOTEN(src_uyvy[7]) << 10) |
- (EIGHTTOTEN(src_uyvy[8]) << 20));
- WRITEWORD(dst_v210 + 12, (EIGHTTOTEN(src_uyvy[9])) |
- (EIGHTTOTEN(src_uyvy[10]) << 10) |
- (EIGHTTOTEN(src_uyvy[11]) << 20));
- src_uyvy += 12;
- dst_v210 += 16;
- }
-}
-
-// TODO(fbarchard): Deprecate, move or expand 422 support?
LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height) {
- if (!src_y || !src_u || !src_v || !dst_frame ||
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_frame = dst_frame + (height - 1) * dst_stride_frame;
- dst_stride_frame = -dst_stride_frame;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
}
- void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_frame, int width) =
- I42xToYUY2Row_C;
-#if defined(HAS_I42XTOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
- I42xToYUY2Row = I42xToYUY2Row_SSE2;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
- I42xToYUY2Row(src_y, src_u, src_y, dst_frame, width);
+ for (y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
- dst_frame += dst_stride_frame;
+ dst_yuy2 += dst_stride_yuy2;
}
return 0;
}
@@ -506,80 +203,106 @@
int I420ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height) {
- if (!src_y || !src_u || !src_v || !dst_frame ||
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_frame = dst_frame + (height - 1) * dst_stride_frame;
- dst_stride_frame = -dst_stride_frame;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
}
- void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_frame, int width) =
- I42xToYUY2Row_C;
-#if defined(HAS_I42XTOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
- I42xToYUY2Row = I42xToYUY2Row_SSE2;
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
- I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
- I42xToYUY2Row(src_y + src_stride_y, src_u, src_v,
- dst_frame + dst_stride_frame, width);
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
src_y += src_stride_y * 2;
src_u += src_stride_u;
src_v += src_stride_v;
- dst_frame += dst_stride_frame * 2;
+ dst_yuy2 += dst_stride_yuy2 * 2;
}
if (height & 1) {
- I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
}
return 0;
}
-// TODO(fbarchard): Deprecate, move or expand 422 support?
LIBYUV_API
int I422ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height) {
- if (!src_y || !src_u || !src_v || !dst_frame ||
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_frame = dst_frame + (height - 1) * dst_stride_frame;
- dst_stride_frame = -dst_stride_frame;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
}
- void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_frame, int width) =
- I42xToUYVYRow_C;
-#if defined(HAS_I42XTOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
- I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
- I42xToUYVYRow(src_y, src_u, src_y, dst_frame, width);
+ for (y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
- dst_frame += dst_stride_frame;
+ dst_uyvy += dst_stride_uyvy;
}
return 0;
}
@@ -588,249 +311,206 @@
int I420ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height) {
- if (!src_y || !src_u || !src_v || !dst_frame ||
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_frame = dst_frame + (height - 1) * dst_stride_frame;
- dst_stride_frame = -dst_stride_frame;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
}
- void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_frame, int width) =
- I42xToUYVYRow_C;
-#if defined(HAS_I42XTOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
- I42xToUYVYRow = I42xToUYVYRow_SSE2;
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
- I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
- I42xToUYVYRow(src_y + src_stride_y, src_u, src_v,
- dst_frame + dst_stride_frame, width);
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
src_y += src_stride_y * 2;
src_u += src_stride_u;
src_v += src_stride_v;
- dst_frame += dst_stride_frame * 2;
+ dst_uyvy += dst_stride_uyvy * 2;
}
if (height & 1) {
- I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
}
return 0;
}
LIBYUV_API
-int I420ToV210(const uint8* src_y, int src_stride_y,
+int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
int width, int height) {
- if (width * 16 / 6 > kMaxStride) { // Row buffer of V210 is required.
- return -1;
- } else if (!src_y || !src_u || !src_v || !dst_frame ||
+ int y;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ // Coalesce rows.
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_frame = dst_frame + (height - 1) * dst_stride_frame;
- dst_stride_frame = -dst_stride_frame;
+ halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_uv = -dst_stride_uv;
}
-
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- void (*UYVYToV210Row)(const uint8* src_uyvy, uint8* dst_v210, int pix);
- UYVYToV210Row = UYVYToV210Row_C;
-
- void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_frame, int width) =
- I42xToUYVYRow_C;
-#if defined(HAS_I42XTOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
- I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Coalesce rows.
+ if (src_stride_u == halfwidth &&
+ src_stride_v == halfwidth &&
+ dst_stride_uv == halfwidth * 2) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height - 1; y += 2) {
- I42xToUYVYRow(src_y, src_u, src_v, row, width);
- UYVYToV210Row(row, dst_frame, width);
- I42xToUYVYRow(src_y + src_stride_y, src_u, src_v, row, width);
- UYVYToV210Row(row, dst_frame + dst_stride_frame, width);
-
- src_y += src_stride_y * 2;
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ for (y = 0; y < halfheight; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
src_u += src_stride_u;
src_v += src_stride_v;
- dst_frame += dst_stride_frame * 2;
- }
- if (height & 1) {
- I42xToUYVYRow(src_y, src_u, src_v, row, width);
- UYVYToV210Row(row, dst_frame, width);
+ dst_uv += dst_stride_uv;
}
return 0;
}
-// Convert I420 to ARGB.
LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
+int I420ToNV21(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
int width, int height) {
- if (!src_y || !src_u || !src_v || !dst_argb ||
+ return I420ToNV12(src_y, src_stride_y,
+ src_v, src_stride_v,
+ src_u, src_stride_u,
+ dst_y, dst_stride_y,
+ dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I422 to RGBA with matrix
+static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width, int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
}
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
#endif
-
- for (int y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_bgra ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
- dst_stride_bgra = -dst_stride_bgra;
- }
- void (*I422ToBGRARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToBGRARow_C;
-#if defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToBGRARow = I422ToBGRARow_Any_NEON;
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I422ToBGRARow = I422ToBGRARow_NEON;
- }
- }
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_AVX2;
}
}
#endif
-
- for (int y = 0; y < height; ++y) {
- I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
- dst_bgra += dst_stride_bgra;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_abgr ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
- dst_stride_abgr = -dst_stride_abgr;
- }
- void (*I422ToABGRRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToABGRRow_C;
-#if defined(HAS_I422TOABGRROW_NEON)
+#if defined(HAS_I422TORGBAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- I422ToABGRRow = I422ToABGRRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToABGRRow = I422ToABGRRow_NEON;
- }
- }
-#elif defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_NEON;
}
}
#endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+ I422ToRGBARow = I422ToRGBARow_DSPR2;
+ }
+#endif
- for (int y = 0; y < height; ++y) {
- I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
- dst_abgr += dst_stride_abgr;
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@@ -847,44 +527,81 @@
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_rgba ||
+ return I420ToRGBAMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to RGB24 with matrix
+static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width, int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
}
- void (*I422ToRGBARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRGBARow_C;
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
-#elif defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
}
}
#endif
- for (int y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
- dst_rgba += dst_stride_rgba;
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@@ -901,51 +618,12 @@
const uint8* src_v, int src_stride_v,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_rgb24 ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
- void (*I422ToRGB24Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRGB24Row_C;
-#if defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#elif defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
+ return I420ToRGB24Matrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants,
+ width, height);
}
// Convert I420 to RAW.
@@ -955,44 +633,130 @@
const uint8* src_v, int src_stride_v,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_raw ||
+ return I420ToRGB24Matrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_raw = dst_raw + (height - 1) * dst_stride_raw;
- dst_stride_raw = -dst_stride_raw;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
}
- void (*I422ToRAWRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRAWRow_C;
-#if defined(HAS_I422TORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRAWRow = I422ToRAWRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToRAWRow = I422ToRAWRow_NEON;
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
}
}
-#elif defined(HAS_I422TORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
- I422ToRAWRow = I422ToRAWRow_SSSE3;
- }
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
}
}
#endif
- for (int y = 0; y < height; ++y) {
- I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
- dst_raw += dst_stride_raw;
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@@ -1007,52 +771,53 @@
int I420ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_rgb, int dst_stride_rgb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_rgb ||
+ int y;
+ void (*I422ToRGB565Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
- dst_stride_rgb = -dst_stride_rgb;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
}
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
+#if defined(HAS_I422TORGB565ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
}
#endif
-
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- void (*ARGBToRGB565Row)(const uint8* src_rgb, uint8* dst_rgb, int pix) =
- ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- if (width * 2 <= kMaxStride) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
}
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
}
}
#endif
- for (int y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row, width);
- ARGBToRGB565Row(row, dst_rgb, width);
- dst_rgb += dst_stride_rgb;
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@@ -1062,122 +827,114 @@
return 0;
}
-// Convert I420 to ARGB1555.
+// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+ 0, 4, 1, 5,
+ 6, 2, 7, 3,
+ 1, 5, 0, 4,
+ 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither4x4, int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
}
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
}
#endif
-
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToARGB1555Row_C;
-#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- if (width * 2 <= kMaxStride) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
- }
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
}
#endif
-
- for (int y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row, width);
- ARGBToARGB1555Row(row, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
}
}
- return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
-#endif
-
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToARGB4444Row_C;
-#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- if (width * 2 <= kMaxStride) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
- }
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
-
- for (int y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row, width);
- ARGBToARGB4444Row(row, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
}
+ free_aligned_buffer_64(row_argb);
}
return 0;
}
@@ -1189,12 +946,13 @@
const uint8* v, int v_stride,
uint8* dst_sample, int dst_sample_stride,
int width, int height,
- uint32 format) {
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int r = 0;
if (!y || !u|| !v || !dst_sample ||
width <= 0 || height == 0) {
return -1;
}
- int r = 0;
switch (format) {
// Single plane formats
case FOURCC_YUY2:
@@ -1213,15 +971,6 @@
dst_sample_stride ? dst_sample_stride : width * 2,
width, height);
break;
- case FOURCC_V210:
- r = I420ToV210(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride :
- (width + 47) / 48 * 128,
- width, height);
- break;
case FOURCC_RGBP:
r = I420ToRGB565(y, y_stride,
u, u_stride,
@@ -1294,48 +1043,40 @@
dst_sample_stride ? dst_sample_stride : width * 4,
width, height);
break;
- case FOURCC_BGGR:
- r = I420ToBayerBGGR(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GBRG:
- r = I420ToBayerGBRG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GRBG:
- r = I420ToBayerGRBG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_RGGB:
- r = I420ToBayerRGGB(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
case FOURCC_I400:
r = I400Copy(y, y_stride,
dst_sample,
dst_sample_stride ? dst_sample_stride : width,
width, height);
break;
+ case FOURCC_NV12: {
+ uint8* dst_uv = dst_sample + width * height;
+ r = I420ToNV12(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_uv,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ case FOURCC_NV21: {
+ uint8* dst_vu = dst_sample + width * height;
+ r = I420ToNV21(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_vu,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ // TODO(fbarchard): Add M420.
// Triplanar formats
// TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420:
- case FOURCC_YU12:
case FOURCC_YV12: {
int halfwidth = (width + 1) / 2;
int halfheight = (height + 1) / 2;
diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc
new file mode 100644
index 0000000..2a8682b
--- /dev/null
+++ b/files/source/convert_from_argb.cc
@@ -0,0 +1,1286 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) = ARGBToUV444Row_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) = ARGBToUV411Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 4 == width &&
+ dst_stride_v * 4 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV411Row = ARGBToUV411Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+ {
+ // Allocate a rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+ {
+ // Allocate a rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
+ if (!src_argb || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ {
+ // Allocate a rows of yuv.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+ src_argb += src_stride_argb;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+ if (!src_argb || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ {
+ // Allocate a rows of yuv.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+ src_argb += src_stride_argb;
+ dst_uyvy += dst_stride_uyvy;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
+static uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ return ARGBShuffle(src_argb, src_stride_argb,
+ dst_rgba, dst_stride_rgba,
+ (const uint8*)(&kShuffleMaskARGBToRGBA),
+ width, height);
+}
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ int y;
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToRGB24Row_C;
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ int y;
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToRAWRow_C;
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_raw == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_raw = 0;
+ }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
+ }
+ return 0;
+}
+
+// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+ 0, 4, 1, 5,
+ 6, 2, 7, 3,
+ 1, 5, 0, 4,
+ 7, 3, 6, 2,
+};
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither4x4, int width, int height) {
+ int y;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+ *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To RGB565.
+// TODO(fbarchard): Consider using dither function low level with zeros.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToRGB565Row_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb565 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb565 = 0;
+ }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ int y;
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToARGB1555Row_C;
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb1555 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb1555 = 0;
+ }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ int y;
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ ARGBToARGB4444Row_C;
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb4444 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb4444 = 0;
+ }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
+}
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ ARGBToYJRow_C;
+ if (!src_argb ||
+ !dst_yj || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+ src_argb += src_stride_argb * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to J422. (JPeg full range I422).
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ ARGBToYJRow_C;
+ if (!src_argb ||
+ !dst_yj || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yj == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height) {
+ int y;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc
new file mode 100644
index 0000000..90f550a
--- /dev/null
+++ b/files/source/convert_jpeg.cc
@@ -0,0 +1,393 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+ uint8* y;
+ int y_stride;
+ uint8* u;
+ int u_stride;
+ uint8* v;
+ int v_stride;
+ int w;
+ int h;
+};
+
+static void JpegCopyI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I420Copy(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I422ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I444ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I411ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I400ToI420(data[0], strides[0],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+ int* width, int* height) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret) {
+ *width = mjpeg_decoder.GetWidth();
+ *height = mjpeg_decoder.GetHeight();
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret ? 0 : -1; // -1 for runtime failure.
+}
+
+// MJPG (Motion JPeg) to I420
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+ uint8* argb;
+ int argb_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I420ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I422ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I444ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I411ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I400ToARGB(data[0], strides[0],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to ARGB
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc
new file mode 100644
index 0000000..aecdc80
--- /dev/null
+++ b/files/source/convert_to_argb.cc
@@ -0,0 +1,305 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+ uint8* crop_argb, int argb_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if destination crop_argb is same as source sample,
+ // also enable temporary buffer.
+ LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+ crop_argb == sample;
+ uint8* dest_argb = crop_argb;
+ int dest_argb_stride = argb_stride;
+ uint8* rotate_buffer = NULL;
+ int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+ if (crop_argb == NULL || sample == NULL ||
+ src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+ if (src_height < 0) {
+ inv_crop_height = -inv_crop_height;
+ }
+
+ if (need_buf) {
+ int argb_size = crop_width * 4 * abs_crop_height;
+ rotate_buffer = (uint8*)malloc(argb_size);
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ crop_argb = rotate_buffer;
+ argb_stride = crop_width * 4;
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToARGB(src, aligned_src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToARGB(src, aligned_src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToARGB(src, src_width * 3,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToARGB(src, src_width * 3,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV21ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_J420: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToARGB(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToARGB(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToARGB(sample, sample_size,
+ crop_argb, argb_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(crop_argb, argb_stride,
+ dest_argb, dest_argb_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc
new file mode 100644
index 0000000..e5f307c
--- /dev/null
+++ b/files/source/convert_to_i420.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ // TODO(nisse): Why allow crop_height < 0?
+ const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+ LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+ format != FOURCC_NV12 && format != FOURCC_NV21 &&
+ format != FOURCC_YV12) || y == sample;
+ uint8* tmp_y = y;
+ uint8* tmp_u = u;
+ uint8* tmp_v = v;
+ int tmp_y_stride = y_stride;
+ int tmp_u_stride = u_stride;
+ int tmp_v_stride = v_stride;
+ uint8* rotate_buffer = NULL;
+ const int inv_crop_height =
+ (src_height < 0) ? -abs_crop_height : abs_crop_height;
+
+ if (!y || !u || !v || !sample ||
+ src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if destination y is same as source sample,
+ // also enable temporary buffer.
+ if (need_buf) {
+ int y_size = crop_width * abs_crop_height;
+ int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+ rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ y = rotate_buffer;
+ u = y + y_size;
+ v = u + uv_size;
+ y_stride = crop_width;
+ u_stride = v_stride = ((crop_width + 1) / 2);
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ // Call NV12 but with u and v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ v, v_stride,
+ u, u_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420Rotate(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToI420(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index 2e96d9b..84927eb 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -4,21 +4,24 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/cpu_id.h"
-#ifdef _MSC_VER
-#include <intrin.h> // For __cpuid()
+#if defined(_MSC_VER)
+#include <intrin.h> // For __cpuidex()
#endif
-#if !defined(__CLR_VER) && defined(_M_X64) && \
- defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
+#if !defined(__native_client__)
#include <stdlib.h> // For getenv()
+#endif
// For ArmCpuCaps() but unittested on all platforms
#include <stdio.h>
@@ -26,186 +29,269 @@
#include "libyuv/basic_types.h" // For CPU_X86
-// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
-static __inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile ( // NOLINT
- "mov %%ebx, %%edi \n"
- "cpuid \n"
- "xchg %%edi, %%ebx \n"
- : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static __inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile ( // NOLINT
- "cpuid \n"
- : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
-}
-#endif
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// Low level cpuid for X86. Returns zeros on other CPUs.
-#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
- defined(__i386__) || defined(__x86_64__))
-LIBYUV_API
-void CpuId(int cpu_info[4], int info_type) {
- __cpuid(cpu_info, info_type);
-}
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
+ !defined(__clang__)
+#define SAFEBUFFERS __declspec(safebuffers)
#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__i386__) || defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
-void CpuId(int cpu_info[4], int) {
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if defined(_MSC_VER)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+ __asm {
+ mov eax, info_eax
+ mov ecx, info_ecx
+ mov edi, cpu_info
+ cpuid
+ mov [edi], eax
+ mov [edi + 4], ebx
+ mov [edi + 8], ecx
+ mov [edi + 12], edx
+ }
+#else // Visual C but not x86
+ if (info_ecx == 0) {
+ __cpuid((int*)(cpu_info), info_eax);
+ } else {
+ cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+ }
+#endif
+// GCC version uses inline x86 assembly.
+#else // defined(_MSC_VER)
+ uint32 info_ebx, info_edx;
+ asm volatile (
+#if defined( __i386__) && defined(__PIC__)
+ // Preserve ebx for fpic 32 bit.
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=D" (info_ebx),
+#else
+ "cpuid \n"
+ : "=b" (info_ebx),
+#endif // defined( __i386__) && defined(__PIC__)
+ "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+ cpu_info[0] = info_eax;
+ cpu_info[1] = info_ebx;
+ cpu_info[2] = info_ecx;
+ cpu_info[3] = info_edx;
+#endif // defined(_MSC_VER)
+}
+#else // (defined(_M_IX86) || defined(_M_X64) ...
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
-// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-#if !defined(__CLR_VER) && defined(_M_X64) && \
- defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
-#define HAS_XGETBV
-static uint32 XGetBV(unsigned int xcr) {
- return static_cast<uint32>(_xgetbv(xcr));
-}
-#elif !defined(__CLR_VER) && defined(_M_IX86)
-#define HAS_XGETBV
-__declspec(naked) __declspec(align(16))
-static uint32 XGetBV(unsigned int xcr) {
- __asm {
- mov ecx, [esp + 4] // xcr
- _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005.
- ret
- }
-}
-#elif defined(__i386__) || defined(__x86_64__)
-#define HAS_XGETBV
-static uint32 XGetBV(unsigned int xcr) {
- uint32 xcr_feature_mask;
- asm volatile ( // NOLINT
- ".byte 0x0f, 0x01, 0xd0\n"
- : "=a"(xcr_feature_mask)
- : "c"(xcr)
- : "memory", "cc", "edx"); // edx unused.
- return xcr_feature_mask;
-}
+// For VS2010 and earlier emit can be used:
+// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+// __asm {
+// xor ecx, ecx // xcr 0
+// xgetbv
+// mov xcr0, eax
+// }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
#endif
-#ifdef HAS_XGETBV
-static const int kXCR_XFEATURE_ENABLED_MASK = 0;
+#if (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__i386__) || defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+#define HAS_XGETBV
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+int GetXCR0() {
+ uint32 xcr0 = 0u;
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
+#elif defined(__i386__) || defined(__x86_64__)
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+#endif // defined(__i386__) || defined(__x86_64__)
+ return xcr0;
+}
+#endif // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
#endif
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
- int flags = 0;
- FILE* fin = fopen(cpuinfo_name, "r");
- if (fin) {
- char buf[512];
- while (fgets(buf, 511, fin)) {
- if (memcmp(buf, "Features", 8) == 0) {
- flags |= kCpuInitialized;
- char* p = strstr(buf, " neon");
- if (p && (p[5] == ' ' || p[5] == '\n')) {
- flags |= kCpuHasNEON;
- break;
- }
+ char cpuinfo_line[512];
+ FILE* f = fopen(cpuinfo_name, "r");
+ if (!f) {
+ // Assume Neon if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return kCpuHasNEON;
+ }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+ char* p = strstr(cpuinfo_line, " neon");
+ if (p && (p[5] == ' ' || p[5] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ // aarch64 uses asimd for Neon.
+ p = strstr(cpuinfo_line, " asimd");
+ if (p && (p[6] == ' ' || p[6] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
}
}
- fclose(fin);
}
- return flags;
+ fclose(f);
+ return 0;
}
// CPU detect function for SIMD instruction sets.
LIBYUV_API
-int cpu_info_ = 0;
+int cpu_info_ = 0; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
-static bool TestEnv(const char* name) {
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+static LIBYUV_BOOL TestEnv(const char* name) {
const char* var = getenv(name);
if (var) {
if (var[0] != '0') {
- return true;
+ return LIBYUV_TRUE;
}
}
- return false;
+ return LIBYUV_FALSE;
}
+#else // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+ return LIBYUV_FALSE;
+}
+#endif
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
-#if !defined(__CLR_VER) && defined(CPU_X86)
- int cpu_info[4];
- __cpuid(cpu_info, 1);
- cpu_info_ = ((cpu_info[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
- ((cpu_info[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
- ((cpu_info[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
- ((cpu_info[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
- (((cpu_info[2] & 0x18000000) == 0x18000000) ? kCpuHasAVX : 0) |
- kCpuInitialized | kCpuHasX86;
+ // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+ int cpu_info = 0;
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+ uint32 cpu_info0[4] = { 0, 0, 0, 0 };
+ uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+ uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+ CpuId(0, 0, cpu_info0);
+ CpuId(1, 0, cpu_info1);
+ if (cpu_info0[0] >= 7) {
+ CpuId(7, 0, cpu_info7);
+ }
+ cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+ ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+ kCpuHasX86;
+
#ifdef HAS_XGETBV
- if (cpu_info_ & kCpuHasAVX) {
- __cpuid(cpu_info, 7);
- if ((cpu_info[1] & 0x00000020) &&
- ((XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06)) {
- cpu_info_ |= kCpuHasAVX2;
+ // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+ if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
+ ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
+ cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+
+ // Detect AVX512bw
+ if ((GetXCR0() & 0xe0) == 0xe0) {
+ cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
}
#endif
- // environment variable overrides for testing.
+
+ // Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
- cpu_info_ &= ~kCpuHasX86;
+ cpu_info &= ~kCpuHasX86;
}
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
- cpu_info_ &= ~kCpuHasSSE2;
+ cpu_info &= ~kCpuHasSSE2;
}
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
- cpu_info_ &= ~kCpuHasSSSE3;
+ cpu_info &= ~kCpuHasSSSE3;
}
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
- cpu_info_ &= ~kCpuHasSSE41;
+ cpu_info &= ~kCpuHasSSE41;
}
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
- cpu_info_ &= ~kCpuHasSSE42;
+ cpu_info &= ~kCpuHasSSE42;
}
if (TestEnv("LIBYUV_DISABLE_AVX")) {
- cpu_info_ &= ~kCpuHasAVX;
+ cpu_info &= ~kCpuHasAVX;
}
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
- cpu_info_ &= ~kCpuHasAVX2;
+ cpu_info &= ~kCpuHasAVX2;
}
- if (TestEnv("LIBYUV_DISABLE_ASM")) {
- cpu_info_ = kCpuInitialized;
+ if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+ cpu_info &= ~kCpuHasERMS;
}
-#elif defined(__arm__)
-#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
- // linux arm parse text file for neon detect.
- cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
-#elif defined(__ARM_NEON__)
- // gcc -mfpu=neon defines __ARM_NEON__
- // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
- // to disable Neon on devices that do not have it.
- cpu_info_ = kCpuHasNEON;
+ if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+ cpu_info &= ~kCpuHasFMA3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX3")) {
+ cpu_info &= ~kCpuHasAVX3;
+ }
#endif
- cpu_info_ |= kCpuInitialized | kCpuHasARM;
- if (TestEnv("LIBYUV_DISABLE_NEON")) {
- cpu_info_ &= ~kCpuHasNEON;
+#if defined(__mips__) && defined(__linux__)
+#if defined(__mips_dspr2)
+ cpu_info |= kCpuHasDSPR2;
+#endif
+ cpu_info |= kCpuHasMIPS;
+ if (getenv("LIBYUV_DISABLE_DSPR2")) {
+ cpu_info &= ~kCpuHasDSPR2;
}
- if (TestEnv("LIBYUV_DISABLE_ASM")) {
- cpu_info_ = kCpuInitialized;
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+ cpu_info = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+#if defined(__aarch64__)
+ cpu_info = kCpuHasNEON;
+#else
+ // Linux arm parse text file for neon detect.
+ cpu_info = ArmCpuCaps("/proc/cpuinfo");
+#endif
+ cpu_info |= kCpuHasARM;
+ if (TestEnv("LIBYUV_DISABLE_NEON")) {
+ cpu_info &= ~kCpuHasNEON;
}
#endif // __arm__
- return cpu_info_;
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info = 0;
+ }
+ cpu_info |= kCpuInitialized;
+ cpu_info_ = cpu_info;
+ return cpu_info;
}
+// Note that use of this function is not thread safe.
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
- InitCpuFlags();
- cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
+ cpu_info_ = InitCpuFlags() & enable_flags;
}
#ifdef __cplusplus
diff --git a/files/source/format_conversion.cc b/files/source/format_conversion.cc
deleted file mode 100644
index ed12de8..0000000
--- a/files/source/format_conversion.cc
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/format_conversion.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
-// and vst would select which 2 components to write. The low level would need
-// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_ARGBTOBAYERROW_SSSE3
-__declspec(naked) __declspec(align(16))
-static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
- movd xmm5, [esp + 12] // selector
- mov ecx, [esp + 16] // pix
- pshufd xmm5, xmm5, 0
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- lea eax, [eax + 16]
- pshufb xmm0, xmm5
- sub ecx, 4
- movd [edx], xmm0
- lea edx, [edx + 4]
- jg wloop
- ret
- }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-#define HAS_ARGBTOBAYERROW_SSSE3
-static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "movd %3,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- ".p2align 4 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movd %%xmm0,(%1) \n"
- "lea 0x4(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "g"(selector) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
-
-);
-}
-#endif
-
-static void ARGBToBayerRow_C(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix) {
- int index0 = selector & 0xff;
- int index1 = (selector >> 8) & 0xff;
- // Copy a row of Bayer.
- for (int x = 0; x < pix - 1; x += 2) {
- dst_bayer[0] = src_argb[index0];
- dst_bayer[1] = src_argb[index1];
- src_argb += 8;
- dst_bayer += 2;
- }
- if (pix & 1) {
- dst_bayer[0] = src_argb[index0];
- }
-}
-
-// generate a selector mask useful for pshufb
-static uint32 GenerateSelector(int select0, int select1) {
- return static_cast<uint32>(select0) |
- static_cast<uint32>((select1 + 4) << 8) |
- static_cast<uint32>((select0 + 8) << 16) |
- static_cast<uint32>((select1 + 12) << 24);
-}
-
-static int MakeSelectors(const int blue_index,
- const int green_index,
- const int red_index,
- uint32 dst_fourcc_bayer,
- uint32 *index_map) {
- // Now build a lookup table containing the indices for the four pixels in each
- // 2x2 Bayer grid.
- switch (dst_fourcc_bayer) {
- case FOURCC_BGGR:
- index_map[0] = GenerateSelector(blue_index, green_index);
- index_map[1] = GenerateSelector(green_index, red_index);
- break;
- case FOURCC_GBRG:
- index_map[0] = GenerateSelector(green_index, blue_index);
- index_map[1] = GenerateSelector(red_index, green_index);
- break;
- case FOURCC_RGGB:
- index_map[0] = GenerateSelector(red_index, green_index);
- index_map[1] = GenerateSelector(green_index, blue_index);
- break;
- case FOURCC_GRBG:
- index_map[0] = GenerateSelector(green_index, red_index);
- index_map[1] = GenerateSelector(blue_index, green_index);
- break;
- default:
- return -1; // Bad FourCC
- }
- return 0;
-}
-
-// Converts 32 bit ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
-#endif
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
-
- for (int y = 0; y < height; ++y) {
- ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
- src_argb += src_stride_argb;
- dst_bayer += dst_stride_bayer;
- }
- return 0;
-}
-
-#define AVG(a, b) (((a) + (b)) >> 1)
-
-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 r = src_bayer1[1];
- for (int x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- r = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[0];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 b = src_bayer1[1];
- for (int x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- b = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer0[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 b = src_bayer0[1];
- for (int x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[7] = 255U;
- b = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer1[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 r = src_bayer0[1];
- for (int x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- r = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[0];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- }
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- uint32 src_fourcc_bayer) {
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- for (int y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- dst_argb + dst_stride_argb, width);
- src_bayer += src_stride_bayer * 2;
- dst_argb += dst_stride_argb * 2;
- }
- if (height & 1) {
- BayerRow0(src_bayer, -src_stride_bayer, dst_argb, width);
- }
- return 0;
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- uint32 src_fourcc_bayer) {
- if (width * 4 > kMaxStride) {
- return -1; // Size too large for row buffer
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- }
-#endif
-
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- for (int y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- row + kMaxStride, width);
- ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- src_bayer += src_stride_bayer * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
- return 0;
-}
-
-// Convert I420 to Bayer.
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- int halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
-#endif
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
-#endif
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
-
- for (int y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row, width);
- ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
- dst_bayer += dst_stride_bayer;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-#define MAKEBAYERFOURCC(BAYER) \
-LIBYUV_API \
-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_y, int dst_stride_y, \
- uint8* dst_u, int dst_stride_u, \
- uint8* dst_v, int dst_stride_v, \
- int width, int height) { \
- return BayerToI420(src_bayer, src_stride_bayer, \
- dst_y, dst_stride_y, \
- dst_u, dst_stride_u, \
- dst_v, dst_stride_v, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
- const uint8* src_u, int src_stride_u, \
- const uint8* src_v, int src_stride_v, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return I420ToBayer(src_y, src_stride_y, \
- src_u, src_stride_u, \
- src_v, src_stride_v, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return ARGBToBayer(src_argb, src_stride_argb, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_argb, int dst_stride_argb, \
- int width, int height) { \
- return BayerToARGB(src_bayer, src_stride_bayer, \
- dst_argb, dst_stride_argb, \
- width, height, \
- FOURCC_##BAYER); \
-}
-
-MAKEBAYERFOURCC(BGGR)
-MAKEBAYERFOURCC(GBRG)
-MAKEBAYERFOURCC(GRBG)
-MAKEBAYERFOURCC(RGGB)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
index aa60394..5081841 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/files/source/mjpeg_decoder.cc
@@ -4,28 +4,41 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/mjpeg_decoder.h"
#ifdef HAVE_JPEG
-// Must be included before jpeglib
#include <assert.h>
-#ifndef __CLR_VER
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
#endif
-#include <stdio.h>
-#include <stdlib.h>
+#endif
+struct FILE; // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
extern "C" {
-#include <jpeglib.h>
-}
+#endif
-#include <climits>
-#include <cstring>
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h" // For CopyPlane().
namespace libyuv {
@@ -43,8 +56,15 @@
const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
MJpegDecoder::MJpegDecoder()
- : has_scanline_padding_(false),
+ : has_scanline_padding_(LIBYUV_FALSE),
num_outbufs_(0),
scanlines_(NULL),
scanlines_sizes_(NULL),
@@ -80,41 +100,9 @@
DestroyOutputBuffers();
}
-// Helper function to validate the jpeg looks ok.
-// TODO(fbarchard): Improve performance. Scan backward for EOI?
-bool ValidateJpeg(const uint8* sample, size_t sample_size) {
- if (sample_size < 64) {
- // ERROR: Invalid jpeg size: sample_size
- return false;
- }
- if (sample[0] != 0xff || sample[1] != 0xd8) {
- // ERROR: Invalid jpeg initial start code
- return false;
- }
- bool soi = true;
- int total_eoi = 0;
- for (int i = 2; i < static_cast<int>(sample_size) - 1; ++i) {
- if (sample[i] == 0xff) {
- if (sample[i + 1] == 0xd8) { // Start Of Image
- soi = true;
- } else if (sample[i + 1] == 0xd9) { // End Of Image
- if (soi) {
- ++total_eoi;
- }
- soi = false;
- }
- }
- }
- if (!total_eoi) {
- // ERROR: Invalid jpeg end code not found. Size sample_size
- return false;
- }
- return true;
-}
-
-bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
if (!ValidateJpeg(src, src_len)) {
- return false;
+ return LIBYUV_FALSE;
}
buf_.data = src;
@@ -125,12 +113,12 @@
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called jpeg_read_header, it experienced an error, and we called
// longjmp() and rewound the stack to here. Return error.
- return false;
+ return LIBYUV_FALSE;
}
#endif
if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
// ERROR: Bad MJPEG header
- return false;
+ return LIBYUV_FALSE;
}
AllocOutputBuffers(GetNumComponents());
for (int i = 0; i < num_outbufs_; ++i) {
@@ -160,10 +148,10 @@
}
if (GetComponentStride(i) != GetComponentWidth(i)) {
- has_scanline_padding_ = true;
+ has_scanline_padding_ = LIBYUV_TRUE;
}
}
- return true;
+ return LIBYUV_TRUE;
}
static int DivideAndRoundUp(int numerator, int denominator) {
@@ -242,45 +230,36 @@
return GetComponentWidth(component) * GetComponentHeight(component);
}
-bool MJpegDecoder::UnloadFrame() {
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
#ifdef HAVE_SETJMP
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called jpeg_abort_decompress, it experienced an error, and we called
// longjmp() and rewound the stack to here. Return error.
- return false;
+ return LIBYUV_FALSE;
}
#endif
jpeg_abort_decompress(decompress_struct_);
- return true;
-}
-
-static void CopyRows(uint8* source, int source_stride,
- uint8* dest, int pixels, int numrows) {
- for (int i = 0; i < numrows; ++i) {
- memcpy(dest, source, pixels);
- dest += pixels;
- source += source_stride;
- }
+ return LIBYUV_TRUE;
}
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-bool MJpegDecoder::DecodeToBuffers(
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
uint8** planes, int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
// ERROR: Bad dimensions
- return false;
+ return LIBYUV_FALSE;
}
#ifdef HAVE_SETJMP
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called into jpeglib, it experienced an error sometime during this
// function call, and we called longjmp() and rewound the stack to here.
// Return error.
- return false;
+ return LIBYUV_FALSE;
}
#endif
if (!StartDecode()) {
- return false;
+ return LIBYUV_FALSE;
}
SetScanlinePointers(databuf_);
int lines_left = dst_height;
@@ -294,7 +273,7 @@
while (skip >= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
skip -= GetImageScanlinesPerImcuRow();
}
@@ -303,7 +282,7 @@
// copy the parts we want into the destination.
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
// TODO(fbarchard): Compute skip to avoid this
@@ -313,8 +292,9 @@
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
rows_to_skip;
int data_to_skip = rows_to_skip * GetComponentStride(i);
- CopyRows(databuf_[i] + data_to_skip, GetComponentStride(i),
- planes[i], GetComponentWidth(i), scanlines_to_copy);
+ CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@@ -326,12 +306,13 @@
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
- CopyRows(databuf_[i], GetComponentStride(i),
- planes[i], GetComponentWidth(i), scanlines_to_copy);
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
@@ -340,36 +321,37 @@
// Have a partial iMCU row left over to decode.
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy =
DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
- CopyRows(databuf_[i], GetComponentStride(i),
- planes[i], GetComponentWidth(i), scanlines_to_copy);
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
return FinishDecode();
}
-bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
// ERROR: Bad dimensions
- return false;
+ return LIBYUV_FALSE;
}
#ifdef HAVE_SETJMP
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called into jpeglib, it experienced an error sometime during this
// function call, and we called longjmp() and rewound the stack to here.
// Return error.
- return false;
+ return LIBYUV_FALSE;
}
#endif
if (!StartDecode()) {
- return false;
+ return LIBYUV_FALSE;
}
SetScanlinePointers(databuf_);
int lines_left = dst_height;
@@ -379,7 +361,7 @@
while (skip >= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
skip -= GetImageScanlinesPerImcuRow();
}
@@ -387,7 +369,7 @@
// Have a partial iMCU row left over to skip.
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
// TODO(fbarchard): Compute skip to avoid this
@@ -414,7 +396,7 @@
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
(*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
}
@@ -422,19 +404,19 @@
// Have a partial iMCU row left over to decode.
if (!DecodeImcuRow()) {
FinishDecode();
- return false;
+ return LIBYUV_FALSE;
}
(*fn)(opaque, databuf_, databuf_strides_, lines_left);
}
return FinishDecode();
}
-void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
+void init_source(j_decompress_ptr cinfo) {
fill_input_buffer(cinfo);
}
-boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
- BufferVector* buf_vec = static_cast<BufferVector*>(cinfo->client_data);
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+ BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
assert(0 && "No more data");
// ERROR: No more data
@@ -446,26 +428,28 @@
return TRUE;
}
-void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
- long num_bytes) { // NOLINT
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
cinfo->src->next_input_byte += num_bytes;
}
-void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
+void term_source(j_decompress_ptr cinfo) {
// Nothing to do.
}
#ifdef HAVE_SETJMP
-void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
+void ErrorHandler(j_common_ptr cinfo) {
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
// recover from errors we use setjmp() as shown in their example. setjmp() is
// C's implementation for the "call with current continuation" functionality
// seen in some functional programming languages.
+ // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
char buf[JMSG_LENGTH_MAX];
(*cinfo->err->format_message)(cinfo, buf);
// ERROR: Error in jpeglib: buf
+#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp()
@@ -514,26 +498,29 @@
}
// JDCT_IFAST and do_block_smoothing improve performance substantially.
-bool MJpegDecoder::StartDecode() {
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
decompress_struct_->raw_data_out = TRUE;
decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
decompress_struct_->dither_mode = JDITHER_NONE;
- decompress_struct_->do_fancy_upsampling = false; // Not applicable to 'raw'
- decompress_struct_->enable_2pass_quant = false; // Only for buffered mode
- decompress_struct_->do_block_smoothing = false; // blocky but fast
+ // Not applicable to 'raw':
+ decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+ // Only for buffered mode:
+ decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+ // Blocky but fast:
+ decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
if (!jpeg_start_decompress(decompress_struct_)) {
// ERROR: Couldn't start JPEG decompressor";
- return false;
+ return LIBYUV_FALSE;
}
- return true;
+ return LIBYUV_TRUE;
}
-bool MJpegDecoder::FinishDecode() {
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
// jpeglib considers it an error if we finish without decoding the whole
// image, so we call "abort" rather than "finish".
jpeg_abort_decompress(decompress_struct_);
- return true;
+ return LIBYUV_TRUE;
}
void MJpegDecoder::SetScanlinePointers(uint8** data) {
@@ -546,8 +533,8 @@
}
}
-inline bool MJpegDecoder::DecodeImcuRow() {
- return static_cast<unsigned int>(GetImageScanlinesPerImcuRow()) ==
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+ return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
jpeg_read_raw_data(decompress_struct_,
scanlines_,
GetImageScanlinesPerImcuRow());
diff --git a/files/source/mjpeg_validate.cc b/files/source/mjpeg_validate.cc
new file mode 100644
index 0000000..9c48832
--- /dev/null
+++ b/files/source/mjpeg_validate.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h> // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to scan for EOI marker (0xff 0xd9).
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+ if (sample_size >= 2) {
+ const uint8* end = sample + sample_size - 1;
+ const uint8* it = sample;
+ while (it < end) {
+ // TODO(fbarchard): scan for 0xd9 instead.
+ it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+ if (it == NULL) {
+ break;
+ }
+ if (it[1] == 0xd9) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ ++it; // Skip over current 0xff.
+ }
+ }
+ // ERROR: Invalid jpeg end code not found. Size sample_size
+ return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+ // Maximum size that ValidateJpeg will consider valid.
+ const size_t kMaxJpegSize = 0x7fffffffull;
+ const size_t kBackSearchSize = 1024;
+ if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
+ // ERROR: Invalid jpeg size: sample_size
+ return LIBYUV_FALSE;
+ }
+ if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
+ // ERROR: Invalid jpeg initial start code
+ return LIBYUV_FALSE;
+ }
+
+ // Look for the End Of Image (EOI) marker near the end of the buffer.
+ if (sample_size > kBackSearchSize) {
+ if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ // Reduce search size for forward search.
+ sample_size = sample_size - kBackSearchSize + 1;
+ }
+ // Step over SOI marker and scan for EOI.
+ return ScanEOI(sample + 2, sample_size - 2);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index a7f5086..237ab68 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -17,6 +17,7 @@
#include "libyuv/mjpeg_decoder.h"
#endif
#include "libyuv/row.h"
+#include "libyuv/scale_row.h" // for ScaleRowDown2
#ifdef __cplusplus
namespace libyuv {
@@ -28,38 +29,160 @@
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height) {
+ int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
- CopyRow = CopyRow_NEON;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
}
-#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
+ // Nothing to do.
+ if (src_y == dst_y && src_stride_y == dst_stride_y) {
+ return;
}
-#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
}
#endif
// Copy plane
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
CopyRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
-// Convert I420 to I400.
LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
- uint8*, int, // src_u
- uint8*, int, // src_v
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+ uint16* dst_y, int dst_stride_y,
+ int width, int height) {
+ int y;
+ void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+#if defined(HAS_COPYROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_16_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_16_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_16_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_16_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height; ++y) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Copy I422.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+}
+
+// Copy I444.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+}
+
+// Copy I400.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height) {
if (!src_y || !dst_y || width <= 0 || height == 0) {
@@ -75,30 +198,73 @@
return 0;
}
-// Mirror a plane of data
+// Convert I420 to I400.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Mirror a plane of data.
void MirrorPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height) {
+ int y;
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
}
#endif
-#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSE2;
#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
+ }
#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+ MirrorRow = MirrorRow_DSPR2;
}
#endif
// Mirror plane
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
MirrorRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
@@ -112,42 +278,52 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) =
+ YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ YUY2ToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
- void (*YUY2ToUV422Row)(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*YUY2ToYRow)(const uint8* src_yuy2,
- uint8* dst_y, int pix);
- YUY2ToYRow = YUY2ToYRow_C;
- YUY2ToUV422Row = YUY2ToUV422Row_C;
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- }
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
-#elif defined(HAS_YUY2TOYROW_NEON)
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width > 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
@@ -156,7 +332,7 @@
}
#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
src_yuy2 += src_stride_yuy2;
@@ -174,42 +350,52 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int y;
+ void (*UYVYToUV422Row)(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) =
+ UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int width) = UYVYToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
- void (*UYVYToUV422Row)(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int pix);
- UYVYToYRow = UYVYToYRow_C;
- UYVYToUV422Row = UYVYToUV422Row_C;
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- if (width > 16) {
- UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- }
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
-#elif defined(HAS_UYVYTOYROW_NEON)
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUV422Row = UYVYToUV422Row_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- if (width > 8) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width > 16) {
- UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
- }
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width >= 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
@@ -218,7 +404,7 @@
}
#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
src_uyvy += src_stride_uyvy;
@@ -229,6 +415,26 @@
return 0;
}
+// Mirror I400 with optional flipping
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
// Mirror I420 with optional flipping
LIBYUV_API
int I420Mirror(const uint8* src_y, int src_stride_y,
@@ -238,6 +444,8 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1;
@@ -245,7 +453,7 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
- int halfheight = (height + 1) >> 1;
+ halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
@@ -254,8 +462,6 @@
src_stride_v = -src_stride_v;
}
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
if (dst_y) {
MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
@@ -269,6 +475,9 @@
int ARGBMirror(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -278,19 +487,33 @@
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-
- void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
- ARGBMirrorRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
}
#endif
// Mirror plane
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGBMirrorRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
@@ -298,7 +521,7 @@
return 0;
}
-// Get a blender that optimized for the CPU, alignment and pixel count.
+// Get a blender that optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
LIBYUV_API
@@ -311,9 +534,9 @@
return ARGBBlendRow;
}
#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBBlendRow = ARGBBlendRow_SSE2;
+#if defined(HAS_ARGBBLENDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBBlendRow = ARGBBlendRow_NEON;
}
#endif
return ARGBBlendRow;
@@ -325,6 +548,9 @@
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = GetARGBBlend();
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -334,10 +560,16 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width) = GetARGBBlend();
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
src_argb0 += src_stride_argb0;
src_argb1 += src_stride_argb1;
@@ -346,185 +578,427 @@
return 0;
}
-// Convert ARGB to I400.
+// Alpha Blend plane and store to destination.
LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+ const uint8* src_y1, int src_stride_y1,
+ const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height) {
- if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ int y;
+ void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+ if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
return -1;
}
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
}
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
+
+ // Coalesce rows for Y plane.
+ if (src_stride_y0 == width &&
+ src_stride_y1 == width &&
+ alpha_stride == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
+ }
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ BlendPlaneRow = BlendPlaneRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ BlendPlaneRow = BlendPlaneRow_AVX2;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
- ARGBToYRow(src_argb, dst_y, width);
- src_argb += src_stride_argb;
+ for (y = 0; y < height; ++y) {
+ BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
+ src_y0 += src_stride_y0;
+ src_y1 += src_stride_y1;
+ alpha += alpha_stride;
dst_y += dst_stride_y;
}
return 0;
}
-// ARGB little endian (bgra in memory) to I422
-// same as I420 except UV plane is full height
+#define MAXTWIDTH 2048
+// Alpha Blend YUV images and store to destination.
LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+ const uint8* src_u0, int src_stride_u0,
+ const uint8* src_v0, int src_stride_v0,
+ const uint8* src_y1, int src_stride_y1,
+ const uint8* src_u1, int src_stride_u1,
+ const uint8* src_v1, int src_stride_v1,
+ const uint8* alpha, int alpha_stride,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ // Half width/height for UV.
+ int halfwidth = (width + 1) >> 1;
+ void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+ if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+ !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
+
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
}
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+
+ // Blend Y plane.
+ BlendPlane(src_y0, src_stride_y0,
+ src_y1, src_stride_y1,
+ alpha, alpha_stride,
+ dst_y, dst_stride_y,
+ width, height);
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- if (width > 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ BlendPlaneRow = BlendPlaneRow_SSSE3;
}
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ BlendPlaneRow = BlendPlaneRow_AVX2;
+ }
+ }
+#endif
+ if (!IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
+ }
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Box_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ ScaleRowDown2 = ScaleRowDown2Box_AVX2;
}
}
}
#endif
- for (int y = 0; y < height; ++y) {
- ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- src_argb += src_stride_argb;
- dst_y += dst_stride_y;
+ // Row buffer for intermediate alpha pixels.
+ align_buffer_64(halfalpha, halfwidth);
+ for (y = 0; y < height; y += 2) {
+ // last row of odd height image use 1 row of alpha instead of 2.
+ if (y == (height - 1)) {
+ alpha_stride = 0;
+ }
+ // Subsample 2 rows of UV to half width and half height.
+ ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
+ alpha += alpha_stride * 2;
+ BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
+ BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
+ src_u0 += src_stride_u0;
+ src_u1 += src_stride_u1;
dst_u += dst_stride_u;
+ src_v0 += src_stride_v0;
+ src_v1 += src_stride_v1;
dst_v += dst_stride_v;
}
+ free_aligned_buffer_64(halfalpha);
return 0;
}
-// Convert I422 to BGRA.
+// Multiply 2 ARGB images and store to destination.
LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_bgra ||
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBMultiplyRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_NEON;
+ }
+ }
+#endif
+
+ // Multiply plane
+ for (y = 0; y < height; ++y) {
+ ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Add 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBAddRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAddRow = ARGBAddRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAddRow = ARGBAddRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_NEON;
+ }
+ }
+#endif
+
+ // Add plane
+ for (y = 0; y < height; ++y) {
+ ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBSubtractRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSubtractRow = ARGBSubtractRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_NEON;
+ }
+ }
+#endif
+
+ // Subtract plane
+ for (y = 0; y < height; ++y) {
+ ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+// Convert I422 to RGBA with matrix
+static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width, int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
- dst_stride_bgra = -dst_stride_bgra;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
}
- void (*I422ToBGRARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToBGRARow_C;
-#if defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToBGRARow = I422ToBGRARow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToBGRARow = I422ToBGRARow_NEON;
- }
- }
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
#endif
-
- for (int y = 0; y < height; ++y) {
- I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
- dst_bgra += dst_stride_bgra;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_abgr ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
- dst_stride_abgr = -dst_stride_abgr;
- }
- void (*I422ToABGRRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToABGRRow_C;
-#if defined(HAS_I422TOABGRROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToABGRRow = I422ToABGRRow_Any_NEON;
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I422ToABGRRow = I422ToABGRRow_NEON;
- }
- }
-#elif defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_AVX2;
}
}
#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+ I422ToRGBARow = I422ToRGBARow_DSPR2;
+ }
+#endif
- for (int y = 0; y < height; ++y) {
- I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
- dst_abgr += dst_stride_abgr;
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
@@ -539,295 +1013,43 @@
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_rgba ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
- void (*I422ToRGBARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToRGBARow_C;
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#elif defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
+ return I422ToRGBAMatrix(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants,
+ width, height);
}
-// Convert ARGB to RGBA.
+// Convert I422 to BGRA.
LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgba, int dst_stride_rgba,
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
- if (!src_argb || !dst_rgba ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- void (*ARGBToRGBARow)(const uint8* src_argb, uint8* dst_rgba, int pix) =
- ARGBToRGBARow_C;
-#if defined(HAS_ARGBTORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- ARGBToRGBARow = ARGBToRGBARow_SSSE3;
- }
-#endif
-#if defined(HAS_ARGBTORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- ARGBToRGBARow = ARGBToRGBARow_NEON;
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- ARGBToRGBARow(src_argb, dst_rgba, width);
- src_argb += src_stride_argb;
- dst_rgba += dst_stride_rgba;
- }
- return 0;
-}
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height) {
- if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRGB24Row_C;
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- if (width * 3 <= kMaxStride) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
- }
- if (IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
- ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width * 3 <= kMaxStride) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
- }
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB24Row = ARGBToRGB24Row_NEON;
- }
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- ARGBToRGB24Row(src_argb, dst_rgb24, width);
- src_argb += src_stride_argb;
- dst_rgb24 += dst_stride_rgb24;
- }
- return 0;
-}
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
- uint8* dst_raw, int dst_stride_raw,
- int width, int height) {
- if (!src_argb || !dst_raw || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRAWRow_C;
-#if defined(HAS_ARGBTORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- if (width * 3 <= kMaxStride) {
- ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
- }
- if (IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
- ARGBToRAWRow = ARGBToRAWRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- if (width * 3 <= kMaxStride) {
- ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
- }
- if (IS_ALIGNED(width, 8)) {
- ARGBToRAWRow = ARGBToRAWRow_NEON;
- }
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- ARGBToRAWRow(src_argb, dst_raw, width);
- src_argb += src_stride_argb;
- dst_raw += dst_stride_raw;
- }
- return 0;
-}
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
- if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- if (width * 2 <= kMaxStride) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
- }
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
- }
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- ARGBToRGB565Row(src_argb, dst_rgb565, width);
- src_argb += src_stride_argb;
- dst_rgb565 += dst_stride_rgb565;
- }
- return 0;
-}
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height) {
- if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToARGB1555Row_C;
-#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- if (width * 2 <= kMaxStride) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
- }
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
- }
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- ARGBToARGB1555Row(src_argb, dst_argb1555, width);
- src_argb += src_stride_argb;
- dst_argb1555 += dst_stride_argb1555;
- }
- return 0;
-}
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height) {
- if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToARGB4444Row_C;
-#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- if (width * 2 <= kMaxStride) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
- }
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
- }
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- ARGBToARGB4444Row(src_argb, dst_argb4444, width);
- src_argb += src_stride_argb;
- dst_argb4444 += dst_stride_argb4444;
- }
- return 0;
+ return I422ToRGBAMatrix(src_y, src_stride_y,
+ src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u,
+ dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
}
// Convert NV12 to RGB565.
-// TODO(fbarchard): (Re) Optimize for Neon.
LIBYUV_API
int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
- if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ int y;
+ void (*NV12ToRGB565Row)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 ||
+ width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -836,33 +1058,33 @@
dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
dst_stride_rgb565 = -dst_stride_rgb565;
}
- void (*NV12ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV12ToARGBRow_C;
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
}
#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
}
#endif
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- NV12ToARGBRow(src_y, src_uv, row, width);
- ARGBToRGB565Row(row, dst_rgb565, width);
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
@@ -872,49 +1094,53 @@
return 0;
}
-// Convert NV21 to RGB565.
+// Convert RAW to RGB24.
LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_vu, int src_stride_vu,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
- if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) {
+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ int y;
+ void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
+ RAWToRGB24Row_C;
+ if (!src_raw || !dst_rgb24 ||
+ width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
}
- void (*NV21ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- int width) = NV21ToARGBRow_C;
-#if defined(HAS_NV21TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 &&
+ dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_rgb24 = 0;
}
-#endif
-
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
- ARGBToRGB565Row_C;
-#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- NV21ToARGBRow(src_y, src_vu, row, width);
- ARGBToRGB565Row(row, dst_rgb565, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGB24Row = RAWToRGB24Row_SSSE3;
}
}
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGB24Row = RAWToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToRGB24Row(src_raw, dst_rgb24, width);
+ src_raw += src_stride_raw;
+ dst_rgb24 += dst_stride_rgb24;
+ }
return 0;
}
@@ -922,24 +1148,44 @@
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value) {
- void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow8_C;
+ int y;
+ void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ dst_stride_y = 0;
+ }
#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- SetRow = SetRow8_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SetRow = SetRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_NEON;
+ }
}
#endif
#if defined(HAS_SETROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- SetRow = SetRow8_X86;
+ if (TestCpuFlag(kCpuHasX86)) {
+ SetRow = SetRow_Any_X86;
+ if (IS_ALIGNED(width, 4)) {
+ SetRow = SetRow_X86;
+ }
+ }
+#endif
+#if defined(HAS_SETROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ SetRow = SetRow_ERMS;
}
#endif
- uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
// Set plane
- for (int y = 0; y < height; ++y) {
- SetRow(dst_y, v32, width);
+ for (y = 0; y < height; ++y) {
+ SetRow(dst_y, value, width);
dst_y += dst_stride_y;
}
}
@@ -952,19 +1198,19 @@
int x, int y,
int width, int height,
int value_y, int value_u, int value_v) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ uint8* start_y = dst_y + y * dst_stride_y + x;
+ uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+ uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v ||
- width <= 0 || height <= 0 ||
+ width <= 0 || height == 0 ||
x < 0 || y < 0 ||
value_y < 0 || value_y > 255 ||
value_u < 0 || value_u > 255 ||
value_v < 0 || value_v > 255) {
return -1;
}
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- uint8* start_y = dst_y + y * dst_stride_y + x;
- uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
- uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
SetPlane(start_y, dst_stride_y, width, height, value_y);
SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
@@ -978,26 +1224,45 @@
int dst_x, int dst_y,
int width, int height,
uint32 value) {
+ int y;
+ void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
if (!dst_argb ||
- width <= 0 || height <= 0 ||
+ width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) {
return -1;
}
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- SetRows32_NEON(dst, value, width, dst_stride_argb, height);
- return 0;
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+
+#if defined(HAS_ARGBSETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSetRow = ARGBSetRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_NEON;
+ }
}
#endif
-#if defined(HAS_SETROW_X86)
+#if defined(HAS_ARGBSETROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
- SetRows32_X86(dst, value, width, dst_stride_argb, height);
- return 0;
+ ARGBSetRow = ARGBSetRow_X86;
}
#endif
- SetRows32_C(dst, value, width, dst_stride_argb, height);
+
+ // Set plane
+ for (y = 0; y < height; ++y) {
+ ARGBSetRow(dst_argb, value, width);
+ dst_argb += dst_stride_argb;
+ }
return 0;
}
@@ -1018,6 +1283,9 @@
int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBAttenuateRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1026,24 +1294,39 @@
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBAttenuateRow_C;
-#if defined(HAS_ARGBATTENUATE_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
}
#endif
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
@@ -1056,6 +1339,9 @@
int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBUnattenuateRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1064,17 +1350,32 @@
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBUnattenuateRow_C;
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ }
}
#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Neon version.
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGBUnattenuateRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
@@ -1087,6 +1388,9 @@
int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ int y;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1095,17 +1399,25 @@
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBGrayRow_C;
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGBGrayRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
@@ -1118,19 +1430,30 @@
int ARGBGray(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y,
int width, int height) {
+ int y;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBGrayRow_C;
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#endif
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- for (int y = 0; y < height; ++y) {
+#if defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
ARGBGrayRow(dst, dst, width);
dst += dst_stride_argb;
}
@@ -1141,78 +1464,182 @@
LIBYUV_API
int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
- void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
#if defined(HAS_ARGBSEPIAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_SSSE3;
}
#endif
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- for (int y = 0; y < height; ++y) {
+#if defined(HAS_ARGBSEPIAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
ARGBSepiaRow(dst, width);
dst += dst_stride_argb;
}
return 0;
}
-// Apply a 4x3 matrix rotation to each ARGB pixel.
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
LIBYUV_API
-int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
const int8* matrix_argb,
- int dst_x, int dst_y, int width, int height) {
- if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
+ int width, int height) {
+ int y;
+ void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+ if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
return -1;
}
- void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
- int width) = ARGBColorMatrixRow_C;
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
}
#endif
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- for (int y = 0; y < height; ++y) {
- ARGBColorMatrixRow(dst, matrix_argb, width);
- dst += dst_stride_argb;
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
}
return 0;
}
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_rgb,
+ int dst_x, int dst_y, int width, int height) {
+ SIMD_ALIGNED(int8 matrix_argb[16]);
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+
+ // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+ matrix_argb[0] = matrix_rgb[0] / 2;
+ matrix_argb[1] = matrix_rgb[1] / 2;
+ matrix_argb[2] = matrix_rgb[2] / 2;
+ matrix_argb[3] = matrix_rgb[3] / 2;
+ matrix_argb[4] = matrix_rgb[4] / 2;
+ matrix_argb[5] = matrix_rgb[5] / 2;
+ matrix_argb[6] = matrix_rgb[6] / 2;
+ matrix_argb[7] = matrix_rgb[7] / 2;
+ matrix_argb[8] = matrix_rgb[8] / 2;
+ matrix_argb[9] = matrix_rgb[9] / 2;
+ matrix_argb[10] = matrix_rgb[10] / 2;
+ matrix_argb[11] = matrix_rgb[11] / 2;
+ matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+ matrix_argb[15] = 64; // 1.0
+
+ return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+ dst, dst_stride_argb,
+ &matrix_argb[0], width, height);
+}
+
// Apply a color table each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = ARGBColorTableRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
dst_x < 0 || dst_y < 0) {
return -1;
}
- void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
- int width) = ARGBColorTableRow_C;
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
#if defined(HAS_ARGBCOLORTABLEROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
ARGBColorTableRow = ARGBColorTableRow_X86;
}
#endif
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGBColorTableRow(dst, table_argb, width);
dst += dst_stride_argb;
}
return 0;
}
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = RGBColorTableRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ RGBColorTableRow = RGBColorTableRow_X86;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ RGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
// ARGBQuantize is used to posterize art.
// e.g. rgb / qvalue * qvalue + qvalue / 2
// But the low levels implement efficiently with 3 parameters, and could be
// used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
// The divide is replaces with a multiply by reciprocal fixed point multiply.
// Caveat - although SSE2 saturates, the C function does not and should be used
// with care if doing anything but quantization.
@@ -1220,20 +1647,31 @@
int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
int scale, int interval_size, int interval_offset,
int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) = ARGBQuantizeRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
interval_size < 1 || interval_size > 255) {
return -1;
}
- void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) = ARGBQuantizeRow_C;
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
#if defined(HAS_ARGBQUANTIZEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
}
#endif
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- for (int y = 0; y < height; ++y) {
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
dst += dst_stride_argb;
}
@@ -1246,19 +1684,20 @@
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height) {
+ int y;
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ int32* previous_cumsum = dst_cumsum;
if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
return -1;
}
- void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
}
#endif
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
- int32* previous_cumsum = dst_cumsum;
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
previous_cumsum = dst_cumsum;
dst_cumsum += dst_stride32_cumsum;
@@ -1276,17 +1715,36 @@
uint8* dst_argb, int dst_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height, int radius) {
+ int y;
+ void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+ int32* cumsum_bot_row;
+ int32* max_cumsum_bot_row;
+ int32* cumsum_top_row;
+
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
- void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
- void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C;
-#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (radius > height) {
+ radius = height;
+ }
+ if (radius > (width / 2 - 1)) {
+ radius = width / 2 - 1;
+ }
+ if (radius <= 0) {
+ return -1;
+ }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
- CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
+ CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
// Compute enough CumulativeSum for first row to be blurred. After this
@@ -1296,16 +1754,18 @@
width, radius);
src_argb = src_argb + radius * src_stride_argb;
- int32* cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+ cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
- const int32* max_cumsum_bot_row =
- &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
- const int32* cumsum_top_row = &dst_cumsum[0];
+ max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+ cumsum_top_row = &dst_cumsum[0];
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
int area = radius * (bot_y - top_y);
+ int boxwidth = radius * 4;
+ int x;
+ int n;
// Increment cumsum_top_row pointer with circular buffer wrap around.
if (top_y) {
@@ -1328,27 +1788,25 @@
}
// Left clipped.
- int boxwidth = radius * 4;
- int x;
for (x = 0; x < radius + 1; ++x) {
- CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], 1);
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], 1);
area += (bot_y - top_y);
boxwidth += 4;
}
// Middle unclipped.
- int n = (width - 1) - radius - x + 1;
- CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], n);
+ n = (width - 1) - radius - x + 1;
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], n);
// Right clipped.
for (x += n; x <= width - 1; ++x) {
area -= (bot_y - top_y);
boxwidth -= 4;
- CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4,
- cumsum_bot_row + (x - radius - 1) * 4,
- boxwidth, area, &dst_argb[x * 4], 1);
+ CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+ cumsum_bot_row + (x - radius - 1) * 4,
+ boxwidth, area, &dst_argb[x * 4], 1);
}
dst_argb += dst_stride_argb;
}
@@ -1360,6 +1818,9 @@
int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value) {
+ int y;
+ void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, uint32 value) = ARGBShadeRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
return -1;
}
@@ -1368,17 +1829,25 @@
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
- int width, uint32 value) = ARGBShadeRow_C;
-#if defined(HAS_ARGBSHADE_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_NEON;
+ }
+#endif
- for (int y = 0; y < height; ++y) {
+ for (y = 0; y < height; ++y) {
ARGBShadeRow(src_argb, dst_argb, width, value);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
@@ -1386,42 +1855,816 @@
return 0;
}
+// Interpolate 2 planes by specified amount (0 to 255).
+LIBYUV_API
+int InterpolatePlane(const uint8* src0, int src_stride0,
+ const uint8* src1, int src_stride1,
+ uint8* dst, int dst_stride,
+ int width, int height, int interpolation) {
+ int y;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst = dst + (height - 1) * dst_stride;
+ dst_stride = -dst_stride;
+ }
+ // Coalesce rows.
+ if (src_stride0 == width &&
+ src_stride1 == width &&
+ dst_stride == width) {
+ width *= height;
+ height = 1;
+ src_stride0 = src_stride1 = dst_stride = 0;
+ }
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
+ IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
+ IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
+ IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ InterpolateRow(dst, src0, src1 - src0, width, interpolation);
+ src0 += src_stride0;
+ src1 += src_stride1;
+ dst += dst_stride;
+ }
+ return 0;
+}
+
// Interpolate 2 ARGB images by specified amount (0 to 255).
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation) {
- if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return InterpolatePlane(src_argb0, src_stride_argb0,
+ src_argb1, src_stride_argb1,
+ dst_argb, dst_stride_argb,
+ width * 4, height, interpolation);
+}
+
+// Interpolate 2 YUV images by specified amount (0 to 255).
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y, int src0_stride_y,
+ const uint8* src0_u, int src0_stride_u,
+ const uint8* src0_v, int src0_stride_v,
+ const uint8* src1_y, int src1_stride_y,
+ const uint8* src1_u, int src1_stride_u,
+ const uint8* src1_v, int src1_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height, int interpolation) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src0_y || !src0_u || !src0_v ||
+ !src1_y || !src1_u || !src1_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ InterpolatePlane(src0_y, src0_stride_y,
+ src1_y, src1_stride_y,
+ dst_y, dst_stride_y,
+ width, height, interpolation);
+ InterpolatePlane(src0_u, src0_stride_u,
+ src1_u, src1_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight, interpolation);
+ InterpolatePlane(src0_v, src0_stride_v,
+ src1_v, src1_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight, interpolation);
+ return 0;
+}
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* shuffler, int width, int height) {
+ int y;
+ void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+ const uint8* shuffler, int width) = ARGBShuffleRow_C;
+ if (!src_bgra || !dst_argb ||
+ width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
}
- void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) = ARGBInterpolateRow_C;
-#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
+ // Coalesce rows.
+ if (src_stride_bgra == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_bgra = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSE2;
+ }
}
#endif
- for (int y = 0; y < height; ++y) {
- ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
- width, interpolation);
- src_argb0 += src_stride_argb0;
- src_argb1 += src_stride_argb1;
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+ src_bgra += src_stride_bgra;
dst_argb += dst_stride_argb;
}
return 0;
}
+// Sobel ARGB effect.
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ void (*SobelRow)(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst, int width)) {
+ int y;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+ ARGBToYJRow_C;
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) = SobelYRow_C;
+ void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobely, int width) =
+ SobelXRow_C;
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_SOBELYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelYRow = SobelYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXRow = SobelXRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+ {
+ // 3 rows with edges before/after.
+ const int kRowSize = (width + kEdge + 31) & ~31;
+ align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+ uint8* row_sobelx = rows;
+ uint8* row_sobely = rows + kRowSize;
+ uint8* row_y = rows + kRowSize * 2;
+
+ // Convert first row.
+ uint8* row_y0 = row_y + kEdge;
+ uint8* row_y1 = row_y0 + kRowSize;
+ uint8* row_y2 = row_y1 + kRowSize;
+ ARGBToYJRow(src_argb, row_y0, width);
+ row_y0[-1] = row_y0[0];
+ memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
+ ARGBToYJRow(src_argb, row_y1, width);
+ row_y1[-1] = row_y1[0];
+ memset(row_y1 + width, row_y1[width - 1], 16);
+ memset(row_y2 + width, 0, 16);
+
+ for (y = 0; y < height; ++y) {
+ // Convert next row of ARGB to G.
+ if (y < (height - 1)) {
+ src_argb += src_stride_argb;
+ }
+ ARGBToYJRow(src_argb, row_y2, width);
+ row_y2[-1] = row_y2[0];
+ row_y2[width] = row_y2[width - 1];
+
+ SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+ SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+ SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+ // Cycle thru circular queue of 3 row_y buffers.
+ {
+ uint8* row_yt = row_y0;
+ row_y0 = row_y1;
+ row_y1 = row_y2;
+ row_y2 = row_yt;
+ }
+
+ dst_argb += dst_stride_argb;
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelRow = SobelRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelRow = SobelRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_NEON;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelRow);
+}
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_NEON;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+ width, height, SobelToPlaneRow);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXYRow = SobelXYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXYRow = SobelXYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_NEON;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelXYRow);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const float* poly,
+ int width, int height) {
+ int y;
+ void (*ARGBPolynomialRow)(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) = ARGBPolynomialRow_C;
+ if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+ IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBPolynomialRow(src_argb, dst_argb, poly, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a lumacolortable to each ARGB pixel.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* luma,
+ int width, int height) {
+ int y;
+ void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, const uint8* luma, const uint32 lumacoeff) =
+ ARGBLumaColorTableRow_C;
+ if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+ ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBCopyAlphaRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBCopyAlphaRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Extract just the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
+ uint8* dst_a, int dst_stride,
+ int width, int height) {
+ if (!src_argb || !dst_a || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb += (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ // Coalesce rows.
+ if (src_stride == width * 4 && dst_stride == width) {
+ width *= height;
+ height = 1;
+ src_stride = dst_stride = 0;
+ }
+ void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) =
+ ARGBExtractAlphaRow_C;
+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
+ : ARGBExtractAlphaRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
+ : ARGBExtractAlphaRow_Any_NEON;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBExtractAlphaRow(src_argb, dst_a, width);
+ src_argb += src_stride;
+ dst_a += dst_stride;
+ }
+ return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ ARGBCopyYToAlphaRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// TODO(fbarchard): Consider if width is even Y channel can be split
+// directly. A SplitUVRow_Odd function could copy the remaining chroma.
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) = SplitUVRow_C;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src_yuy2 ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2;
+ // row of y and 2 rows of uv
+ align_buffer_64(rows, awidth * 3);
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+ memcpy(dst_y, rows, width);
+ SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
+ memcpy(dst_y + dst_stride_y, rows, width);
+ InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, rows, dst_uv, awidth);
+ memcpy(dst_y, rows, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) = SplitUVRow_C;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src_uyvy ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2;
+ // row of y and 2 rows of uv
+ align_buffer_64(rows, awidth * 3);
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Split Y from UV.
+ SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+ memcpy(dst_y, rows, width);
+ SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
+ memcpy(dst_y + dst_stride_y, rows, width);
+ InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ // Split Y from UV.
+ SplitUVRow(src_uyvy, dst_uv, rows, awidth);
+ memcpy(dst_y, rows, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index cac3fa0..01ea5c4 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -13,6 +13,7 @@
#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -20,785 +21,46 @@
extern "C" {
#endif
-#if !defined(YUV_DISABLE_ASM) && \
- (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#if defined(__APPLE__) && defined(__i386__)
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".private_extern _" #name " \n" \
- ".align 4,0x90 \n" \
-"_" #name ": \n"
-#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".align 4,0x90 \n" \
-"_" #name ": \n"
-#else
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".align 4,0x90 \n" \
-#name ": \n"
-#endif
-#endif
-
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-#define HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_MIRRORROW_UV_NEON
-void MirrorRowUV_NEON(const uint8* src,
- uint8* dst_a, uint8* dst_b,
- int width);
-#define HAS_TRANSPOSE_WX8_NEON
-void TransposeWx8_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-#define HAS_TRANSPOSE_UVWX8_NEON
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width);
-#endif // defined(__ARM_NEON__)
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked) __declspec(align(16))
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- __asm {
- push edi
- push esi
- push ebp
- mov eax, [esp + 12 + 4] // src
- mov edi, [esp + 12 + 8] // src_stride
- mov edx, [esp + 12 + 12] // dst
- mov esi, [esp + 12 + 16] // dst_stride
- mov ecx, [esp + 12 + 20] // width
-
- // Read in the data from the source pointer.
- // First round of bit swap.
- align 16
- convertloop:
- movq xmm0, qword ptr [eax]
- lea ebp, [eax + 8]
- movq xmm1, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm0, xmm1
- movq xmm2, qword ptr [eax]
- movdqa xmm1, xmm0
- palignr xmm1, xmm1, 8
- movq xmm3, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm2, xmm3
- movdqa xmm3, xmm2
- movq xmm4, qword ptr [eax]
- palignr xmm3, xmm3, 8
- movq xmm5, qword ptr [eax + edi]
- punpcklbw xmm4, xmm5
- lea eax, [eax + 2 * edi]
- movdqa xmm5, xmm4
- movq xmm6, qword ptr [eax]
- palignr xmm5, xmm5, 8
- movq xmm7, qword ptr [eax + edi]
- punpcklbw xmm6, xmm7
- mov eax, ebp
- movdqa xmm7, xmm6
- palignr xmm7, xmm7, 8
- // Second round of bit swap.
- punpcklwd xmm0, xmm2
- punpcklwd xmm1, xmm3
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- palignr xmm2, xmm2, 8
- palignr xmm3, xmm3, 8
- punpcklwd xmm4, xmm6
- punpcklwd xmm5, xmm7
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- palignr xmm6, xmm6, 8
- palignr xmm7, xmm7, 8
- // Third round of bit swap.
- // Write to the destination pointer.
- punpckldq xmm0, xmm4
- movq qword ptr [edx], xmm0
- movdqa xmm4, xmm0
- palignr xmm4, xmm4, 8
- movq qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- punpckldq xmm2, xmm6
- movdqa xmm6, xmm2
- palignr xmm6, xmm6, 8
- movq qword ptr [edx], xmm2
- punpckldq xmm1, xmm5
- movq qword ptr [edx + esi], xmm6
- lea edx, [edx + 2 * esi]
- movdqa xmm5, xmm1
- movq qword ptr [edx], xmm1
- palignr xmm5, xmm5, 8
- punpckldq xmm3, xmm7
- movq qword ptr [edx + esi], xmm5
- lea edx, [edx + 2 * esi]
- movq qword ptr [edx], xmm3
- movdqa xmm7, xmm3
- palignr xmm7, xmm7, 8
- sub ecx, 8
- movq qword ptr [edx + esi], xmm7
- lea edx, [edx + 2 * esi]
- jg convertloop
-
- pop ebp
- pop esi
- pop edi
- ret
- }
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked) __declspec(align(16))
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int w) {
- __asm {
- push ebx
- push esi
- push edi
- push ebp
- mov eax, [esp + 16 + 4] // src
- mov edi, [esp + 16 + 8] // src_stride
- mov edx, [esp + 16 + 12] // dst_a
- mov esi, [esp + 16 + 16] // dst_stride_a
- mov ebx, [esp + 16 + 20] // dst_b
- mov ebp, [esp + 16 + 24] // dst_stride_b
- mov ecx, esp
- sub esp, 4 + 16
- and esp, ~15
- mov [esp + 16], ecx
- mov ecx, [ecx + 16 + 28] // w
-
- align 16
- convertloop:
- // Read in the data from the source pointer.
- // First round of bit swap.
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm0 // use xmm7 as temp register.
- punpcklbw xmm0, xmm1
- punpckhbw xmm7, xmm1
- movdqa xmm1, xmm7
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm2
- punpcklbw xmm2, xmm3
- punpckhbw xmm7, xmm3
- movdqa xmm3, xmm7
- movdqa xmm4, [eax]
- movdqa xmm5, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm4
- punpcklbw xmm4, xmm5
- punpckhbw xmm7, xmm5
- movdqa xmm5, xmm7
- movdqa xmm6, [eax]
- movdqa xmm7, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa [esp], xmm5 // backup xmm5
- neg edi
- movdqa xmm5, xmm6 // use xmm5 as temp register.
- punpcklbw xmm6, xmm7
- punpckhbw xmm5, xmm7
- movdqa xmm7, xmm5
- lea eax, [eax + 8 * edi + 16]
- neg edi
- // Second round of bit swap.
- movdqa xmm5, xmm0
- punpcklwd xmm0, xmm2
- punpckhwd xmm5, xmm2
- movdqa xmm2, xmm5
- movdqa xmm5, xmm1
- punpcklwd xmm1, xmm3
- punpckhwd xmm5, xmm3
- movdqa xmm3, xmm5
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm6
- punpckhwd xmm5, xmm6
- movdqa xmm6, xmm5
- movdqa xmm5, [esp] // restore xmm5
- movdqa [esp], xmm6 // backup xmm6
- movdqa xmm6, xmm5 // use xmm6 as temp register.
- punpcklwd xmm5, xmm7
- punpckhwd xmm6, xmm7
- movdqa xmm7, xmm6
- // Third round of bit swap.
- // Write to the destination pointer.
- movdqa xmm6, xmm0
- punpckldq xmm0, xmm4
- punpckhdq xmm6, xmm4
- movdqa xmm4, xmm6
- movdqa xmm6, [esp] // restore xmm6
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [ebx], xmm0
- movlpd qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm4
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm2 // use xmm0 as the temp register.
- punpckldq xmm2, xmm6
- movlpd qword ptr [edx], xmm2
- movhpd qword ptr [ebx], xmm2
- punpckhdq xmm0, xmm6
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm1 // use xmm0 as the temp register.
- punpckldq xmm1, xmm5
- movlpd qword ptr [edx], xmm1
- movhpd qword ptr [ebx], xmm1
- punpckhdq xmm0, xmm5
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm3 // use xmm0 as the temp register.
- punpckldq xmm3, xmm7
- movlpd qword ptr [edx], xmm3
- movhpd qword ptr [ebx], xmm3
- punpckhdq xmm0, xmm7
- sub ecx, 8
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- jg convertloop
-
- mov esp, [esp + 16]
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
-#define HAS_TRANSPOSE_WX8_SSSE3
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- ".p2align 4 \n"
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory", "cc"
- #if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- #endif
- );
-}
-
-#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
-#define HAS_TRANSPOSE_UVWX8_SSE2
-extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int w);
- asm (
- DECLARE_FUNCTION(TransposeUVWx8_SSE2)
- "push %ebx \n"
- "push %esi \n"
- "push %edi \n"
- "push %ebp \n"
- "mov 0x14(%esp),%eax \n"
- "mov 0x18(%esp),%edi \n"
- "mov 0x1c(%esp),%edx \n"
- "mov 0x20(%esp),%esi \n"
- "mov 0x24(%esp),%ebx \n"
- "mov 0x28(%esp),%ebp \n"
- "mov %esp,%ecx \n"
- "sub $0x14,%esp \n"
- "and $0xfffffff0,%esp \n"
- "mov %ecx,0x10(%esp) \n"
- "mov 0x2c(%ecx),%ecx \n"
-
-"1: \n"
- "movdqa (%eax),%xmm0 \n"
- "movdqa (%eax,%edi,1),%xmm1 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm0,%xmm7 \n"
- "punpcklbw %xmm1,%xmm0 \n"
- "punpckhbw %xmm1,%xmm7 \n"
- "movdqa %xmm7,%xmm1 \n"
- "movdqa (%eax),%xmm2 \n"
- "movdqa (%eax,%edi,1),%xmm3 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm2,%xmm7 \n"
- "punpcklbw %xmm3,%xmm2 \n"
- "punpckhbw %xmm3,%xmm7 \n"
- "movdqa %xmm7,%xmm3 \n"
- "movdqa (%eax),%xmm4 \n"
- "movdqa (%eax,%edi,1),%xmm5 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm4,%xmm7 \n"
- "punpcklbw %xmm5,%xmm4 \n"
- "punpckhbw %xmm5,%xmm7 \n"
- "movdqa %xmm7,%xmm5 \n"
- "movdqa (%eax),%xmm6 \n"
- "movdqa (%eax,%edi,1),%xmm7 \n"
- "lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm5,(%esp) \n"
- "neg %edi \n"
- "movdqa %xmm6,%xmm5 \n"
- "punpcklbw %xmm7,%xmm6 \n"
- "punpckhbw %xmm7,%xmm5 \n"
- "movdqa %xmm5,%xmm7 \n"
- "lea 0x10(%eax,%edi,8),%eax \n"
- "neg %edi \n"
- "movdqa %xmm0,%xmm5 \n"
- "punpcklwd %xmm2,%xmm0 \n"
- "punpckhwd %xmm2,%xmm5 \n"
- "movdqa %xmm5,%xmm2 \n"
- "movdqa %xmm1,%xmm5 \n"
- "punpcklwd %xmm3,%xmm1 \n"
- "punpckhwd %xmm3,%xmm5 \n"
- "movdqa %xmm5,%xmm3 \n"
- "movdqa %xmm4,%xmm5 \n"
- "punpcklwd %xmm6,%xmm4 \n"
- "punpckhwd %xmm6,%xmm5 \n"
- "movdqa %xmm5,%xmm6 \n"
- "movdqa (%esp),%xmm5 \n"
- "movdqa %xmm6,(%esp) \n"
- "movdqa %xmm5,%xmm6 \n"
- "punpcklwd %xmm7,%xmm5 \n"
- "punpckhwd %xmm7,%xmm6 \n"
- "movdqa %xmm6,%xmm7 \n"
- "movdqa %xmm0,%xmm6 \n"
- "punpckldq %xmm4,%xmm0 \n"
- "punpckhdq %xmm4,%xmm6 \n"
- "movdqa %xmm6,%xmm4 \n"
- "movdqa (%esp),%xmm6 \n"
- "movlpd %xmm0,(%edx) \n"
- "movhpd %xmm0,(%ebx) \n"
- "movlpd %xmm4,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm4,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "movdqa %xmm2,%xmm0 \n"
- "punpckldq %xmm6,%xmm2 \n"
- "movlpd %xmm2,(%edx) \n"
- "movhpd %xmm2,(%ebx) \n"
- "punpckhdq %xmm6,%xmm0 \n"
- "movlpd %xmm0,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm0,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "movdqa %xmm1,%xmm0 \n"
- "punpckldq %xmm5,%xmm1 \n"
- "movlpd %xmm1,(%edx) \n"
- "movhpd %xmm1,(%ebx) \n"
- "punpckhdq %xmm5,%xmm0 \n"
- "movlpd %xmm0,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm0,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "movdqa %xmm3,%xmm0 \n"
- "punpckldq %xmm7,%xmm3 \n"
- "movlpd %xmm3,(%edx) \n"
- "movhpd %xmm3,(%ebx) \n"
- "punpckhdq %xmm7,%xmm0 \n"
- "sub $0x8,%ecx \n"
- "movlpd %xmm0,(%edx,%esi,1) \n"
- "lea (%edx,%esi,2),%edx \n"
- "movhpd %xmm0,(%ebx,%ebp,1) \n"
- "lea (%ebx,%ebp,2),%ebx \n"
- "jg 1b \n"
- "mov 0x10(%esp),%esp \n"
- "pop %ebp \n"
- "pop %edi \n"
- "pop %esi \n"
- "pop %ebx \n"
- "ret \n"
-);
-#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
-// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
-#define HAS_TRANSPOSE_WX8_FAST_SSSE3
-static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- ".p2align 4 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqa (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqa (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqa (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqa (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqa (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
-);
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int w) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- ".p2align 4 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
- // Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(w) // %3
- : "r"(static_cast<intptr_t>(src_stride)), // %4
- "r"(static_cast<intptr_t>(dst_stride_a)), // %5
- "r"(static_cast<intptr_t>(dst_stride_b)) // %6
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9"
-);
-}
-#endif
-#endif
-
-static void TransposeWx8_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) {
- for (int i = 0; i < width; ++i) {
- dst[0] = src[0 * src_stride];
- dst[1] = src[1 * src_stride];
- dst[2] = src[2 * src_stride];
- dst[3] = src[3 * src_stride];
- dst[4] = src[4 * src_stride];
- dst[5] = src[5 * src_stride];
- dst[6] = src[6 * src_stride];
- dst[7] = src[7 * src_stride];
- ++src;
- dst += dst_stride;
- }
-}
-
-static void TransposeWxH_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- for (int i = 0; i < width; ++i) {
- for (int j = 0; j < height; ++j) {
- dst[i * dst_stride + j] = src[j * src_stride + i];
- }
- }
-}
-
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
+ int i = height;
void (*TransposeWx8)(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width) = TransposeWx8_C;
-#if defined(HAS_TRANSPOSE_WX8_NEON)
+ uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
}
#endif
-#if defined(HAS_TRANSPOSE_WX8_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
- TransposeWx8 = TransposeWx8_SSSE3;
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
}
#endif
-#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
- TransposeWx8 = TransposeWx8_FAST_SSSE3;
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx8 = TransposeWx8_Fast_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ if (IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeWx8 = TransposeWx8_Fast_DSPR2;
+ } else {
+ TransposeWx8 = TransposeWx8_DSPR2;
+ }
}
#endif
// Work across the source in 8x8 tiles
- int i = height;
while (i >= 8) {
TransposeWx8(src, src_stride, dst, dst_stride, width);
src += 8 * src_stride; // Go down 8 rows.
@@ -806,7 +68,9 @@
i -= 8;
}
- TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+ if (i > 0) {
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+ }
}
LIBYUV_API
@@ -837,56 +101,74 @@
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
- void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_NEON;
- }
-#endif
-#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSE2;
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
-#endif
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
- if (width > kMaxStride) {
- return;
- }
// Swap first and last row and mirror the content. Uses a temporary row.
- SIMD_ALIGNED(uint8 row[kMaxStride]);
+ align_buffer_64(row, width);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);
int half_height = (height + 1) >> 1;
+ int y;
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+ MirrorRow = MirrorRow_DSPR2;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
// Odd height will harmlessly mirror the middle row twice.
- for (int y = 0; y < half_height; ++y) {
+ for (y = 0; y < half_height; ++y) {
MirrorRow(src, row, width); // Mirror first row into a buffer
src += src_stride;
MirrorRow(src_bot, dst, width); // Mirror last row into first row
@@ -895,44 +177,7 @@
src_bot -= src_stride;
dst_bot -= dst_stride;
}
-}
-
-static void TransposeUVWx8_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width) {
- for (int i = 0; i < width; ++i) {
- dst_a[0] = src[0 * src_stride + 0];
- dst_b[0] = src[0 * src_stride + 1];
- dst_a[1] = src[1 * src_stride + 0];
- dst_b[1] = src[1 * src_stride + 1];
- dst_a[2] = src[2 * src_stride + 0];
- dst_b[2] = src[2 * src_stride + 1];
- dst_a[3] = src[3 * src_stride + 0];
- dst_b[3] = src[3 * src_stride + 1];
- dst_a[4] = src[4 * src_stride + 0];
- dst_b[4] = src[4 * src_stride + 1];
- dst_a[5] = src[5 * src_stride + 0];
- dst_b[5] = src[5 * src_stride + 1];
- dst_a[6] = src[6 * src_stride + 0];
- dst_b[6] = src[6 * src_stride + 1];
- dst_a[7] = src[7 * src_stride + 0];
- dst_b[7] = src[7 * src_stride + 1];
- src += 2;
- dst_a += dst_stride_a;
- dst_b += dst_stride_b;
- }
-}
-
-static void TransposeUVWxH_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
- for (int i = 0; i < width * 2; i += 2)
- for (int j = 0; j < height; ++j) {
- dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
- dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
- }
+ free_aligned_buffer_64(row);
}
LIBYUV_API
@@ -940,24 +185,32 @@
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
+ int i = height;
void (*TransposeUVWx8)(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
-#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
}
-#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
- TransposeUVWx8 = TransposeUVWx8_SSE2;
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_DSPR2;
}
#endif
// Work through the source in 8x8 tiles.
- int i = height;
while (i >= 8) {
TransposeUVWx8(src, src_stride,
dst_a, dst_stride_a,
@@ -969,10 +222,12 @@
i -= 8;
}
- TransposeUVWxH_C(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, i);
+ if (i > 0) {
+ TransposeUVWxH_C(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, i);
+ }
}
LIBYUV_API
@@ -1011,25 +266,31 @@
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
- void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
- MirrorRowUV_C;
-#if defined(HAS_MIRRORROW_UV_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRowUV = MirrorRowUV_NEON;
+ int i;
+ void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_NEON;
}
-#elif defined(HAS_MIRRORROW_UV_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
- MirrorRowUV = MirrorRowUV_SSSE3;
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ MirrorUVRow = MirrorUVRow_DSPR2;
}
#endif
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);
- for (int i = 0; i < height; ++i) {
- MirrorRowUV(src, dst_a, dst_b, width);
+ for (i = 0; i < height; ++i) {
+ MirrorUVRow(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
@@ -1037,6 +298,50 @@
}
LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -1044,13 +349,13 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
- RotationMode mode) {
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
return -1;
}
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
// Negative height means invert the image.
if (height < 0) {
@@ -1120,13 +425,13 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
- RotationMode mode) {
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
if (!src_y || !src_uv || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
return -1;
}
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
// Negative height means invert the image.
if (height < 0) {
diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc
new file mode 100644
index 0000000..31a74c3
--- /dev/null
+++ b/files/source/rotate_any.cc
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, MASK) \
+ void NAMEANY(const uint8* src, int src_stride, \
+ uint8* dst, int dst_stride, int width) { \
+ int r = width & MASK; \
+ int n = width - r; \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
+ } \
+ TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
+ }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_DSPR2
+TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#endif
+#undef TANY
+
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
+ void NAMEANY(const uint8* src, int src_stride, \
+ uint8* dst_a, int dst_stride_a, \
+ uint8* dst_b, int dst_stride_b, int width) { \
+ int r = width & MASK; \
+ int n = width - r; \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
+ n); \
+ } \
+ TransposeUVWx8_C(src + n * 2, src_stride, \
+ dst_a + n * dst_stride_a, dst_stride_a, \
+ dst_b + n * dst_stride_b, dst_stride_b, r); \
+ }
+
+#ifdef HAS_TRANSPOSEUVWX8_NEON
+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_DSPR2
+TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#endif
+#undef TUVANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
index 9c99446..787c0ad 100644
--- a/files/source/rotate_argb.cc
+++ b/files/source/rotate_argb.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -22,32 +22,41 @@
// ARGBScale has a function to copy pixels to a row, striding each source
// pixel by a constant.
-#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
- defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
- int src_stepx,
- uint8* dst_ptr, int dst_width);
+ int src_stepx, uint8* dst_ptr, int dst_width);
#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+ int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
- int src_stepx,
- uint8* dst_ptr, int dst_width);
+ int src_stepx, uint8* dst_ptr, int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+ uint8* dst, int dst_stride, int width, int height) {
+ int i;
+ int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(height, 4) && // width of dest.
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+ }
+#endif
- int src_pixel_step = src_stride / 4;
- for (int i = 0; i < width; ++i) { // column of source to row of dest.
+ for (i = 0; i < width; ++i) { // column of source to row of dest.
ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
dst += dst_stride;
src += 4;
@@ -55,8 +64,7 @@
}
void ARGBRotate90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+ uint8* dst, int dst_stride, int width, int height) {
// Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@@ -66,8 +74,7 @@
}
void ARGBRotate270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+ uint8* dst, int dst_stride, int width, int height) {
// Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@@ -77,60 +84,83 @@
}
void ARGBRotate180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
- void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
- ARGBMirrorRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
- }
-#endif
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
- if (width * 4 > kMaxStride) {
- return;
- }
+ uint8* dst, int dst_stride, int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
- SIMD_ALIGNED(uint8 row[kMaxStride]);
+ align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);
int half_height = (height + 1) >> 1;
+ int y;
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
// Odd height will harmlessly mirror the middle row twice.
- for (int y = 0; y < half_height; ++y) {
+ for (y = 0; y < half_height; ++y) {
ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
- dst += dst_stride;
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src += src_stride;
+ dst += dst_stride;
src_bot -= src_stride;
dst_bot -= dst_stride;
}
+ free_aligned_buffer_64(row);
}
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- RotationMode mode) {
+ uint8* dst_argb, int dst_stride_argb, int width, int height,
+ enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;
}
diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc
new file mode 100644
index 0000000..b33a9a0
--- /dev/null
+++ b/files/source/rotate_common.cc
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_a[0] = src[0 * src_stride + 0];
+ dst_b[0] = src[0 * src_stride + 1];
+ dst_a[1] = src[1 * src_stride + 0];
+ dst_b[1] = src[1 * src_stride + 1];
+ dst_a[2] = src[2 * src_stride + 0];
+ dst_b[2] = src[2 * src_stride + 1];
+ dst_a[3] = src[3 * src_stride + 0];
+ dst_b[3] = src[3 * src_stride + 1];
+ dst_a[4] = src[4 * src_stride + 0];
+ dst_b[4] = src[4 * src_stride + 1];
+ dst_a[5] = src[5 * src_stride + 0];
+ dst_b[5] = src[5 * src_stride + 1];
+ dst_a[6] = src[6 * src_stride + 0];
+ dst_b[6] = src[6 * src_stride + 1];
+ dst_a[7] = src[7 * src_stride + 0];
+ dst_b[7] = src[7 * src_stride + 1];
+ src += 2;
+ dst_a += dst_stride_a;
+ dst_b += dst_stride_b;
+ }
+}
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ int i;
+ for (i = 0; i < width * 2; i += 2) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+ dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+ }
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
new file mode 100644
index 0000000..cbe870c
--- /dev/null
+++ b/files/source/rotate_gcc.cc
@@ -0,0 +1,368 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+ );
+}
+#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8. 64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
+ );
+}
+#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_mips.cc b/files/source/rotate_mips.cc
new file mode 100644
index 0000000..1e8ce25
--- /dev/null
+++ b/files/source/rotate_mips.cc
@@ -0,0 +1,484 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "sw $s0, 0(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "sw $s1, 4(%[dst]) \n"
+ "bnez %[width], 1b \n"
+ " addu %[dst], %[dst], %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "swr $s0, 0(%[dst]) \n"
+ "swl $s0, 3(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "swr $s1, 4(%[dst]) \n"
+ "swl $s1, 7(%[dst]) \n"
+ "bnez %[width], 11b \n"
+ "addu %[dst], %[dst], %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1"
+ );
+}
+
+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm__ __volatile__ (
+ ".set noat \n"
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+
+ "srl $AT, %[width], 0x2 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "sw $s4, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $s6, 0($s0) \n"
+ "sw $t8, 4($s0) \n"
+ "sw $s5, 0($s1) \n"
+ "sw $t1, 4($s1) \n"
+ "sw $s7, 0($s2) \n"
+ "sw $t9, 4($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 1b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "swr $s4, 0(%[dst]) \n"
+ "swl $s4, 3(%[dst]) \n"
+ "swr $t0, 4(%[dst]) \n"
+ "swl $t0, 7(%[dst]) \n"
+ "swr $s6, 0($s0) \n"
+ "swl $s6, 3($s0) \n"
+ "swr $t8, 4($s0) \n"
+ "swl $t8, 7($s0) \n"
+ "swr $s5, 0($s1) \n"
+ "swl $s5, 3($s1) \n"
+ "swr $t1, 4($s1) \n"
+ "swl $t1, 7($s1) \n"
+ "swr $s7, 0($s2) \n"
+ "swl $s7, 3($s2) \n"
+ "swr $t9, 4($s2) \n"
+ "swl $t9, 7($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 11b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ ".set at \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
+ );
+}
+
+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "subu $t7, $t9, %[src_stride] \n"
+ "srl $t1, %[width], 1 \n"
+
+// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
+ "andi $t0, %[dst_a], 0x3 \n"
+ "andi $t8, %[dst_b], 0x3 \n"
+ "or $t0, $t0, $t8 \n"
+ "andi $t8, %[dst_stride_a], 0x3 \n"
+ "andi $s5, %[dst_stride_b], 0x3 \n"
+ "or $t8, $t8, $s5 \n"
+ "or $t0, $t0, $t8 \n"
+ "bnez $t0, 11f \n"
+ " nop \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+ "1: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "sw $s3, 0($s5) \n"
+ "sw $s4, 0($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "sw $s3, 0(%[dst_a]) \n"
+ "sw $s4, 0(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "sw $s3, 4($s5) \n"
+ "sw $s4, 4($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "sw $s3, 4(%[dst_a]) \n"
+ "sw $s4, 4(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 1b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+ "b 2f \n"
+ " nop \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "swr $s3, 0($s5) \n"
+ "swl $s3, 3($s5) \n"
+ "swr $s4, 0($s6) \n"
+ "swl $s4, 3($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "swr $s3, 0(%[dst_a]) \n"
+ "swl $s3, 3(%[dst_a]) \n"
+ "swr $s4, 0(%[dst_b]) \n"
+ "swl $s4, 3(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+
+ "swr $s3, 4($s5) \n"
+ "swl $s3, 7($s5) \n"
+ "swr $s4, 4($s6) \n"
+ "swl $s4, 7($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "swr $s3, 4(%[dst_a]) \n"
+ "swl $s3, 7(%[dst_a]) \n"
+ "swr $s4, 4(%[dst_b]) \n"
+ "swl $s4, 7(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 11b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+
+ "2: \n"
+ ".set pop \n"
+ : [src] "+r" (src),
+ [dst_a] "+r" (dst_a),
+ [dst_b] "+r" (dst_b),
+ [width] "+r" (width),
+ [src_stride] "+r" (src_stride)
+ : [dst_stride_a] "r" (dst_stride_a),
+ [dst_stride_b] "r" (dst_stride_b)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
index 49b3003..1c22b47 100644
--- a/files/source/rotate_neon.cc
+++ b/files/source/rotate_neon.cc
@@ -4,11 +4,12 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@@ -17,33 +18,42 @@
extern "C" {
#endif
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
-static const uvec8 kVTbl4x4Transpose =
+static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
+ const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %4, #8 \n"
+ "sub %5, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- ".p2align 4 \n"
"1: \n"
- "mov r9, %0 \n"
+ "mov %0, %1 \n"
- "vld1.8 {d0}, [r9], %1 \n"
- "vld1.8 {d1}, [r9], %1 \n"
- "vld1.8 {d2}, [r9], %1 \n"
- "vld1.8 {d3}, [r9], %1 \n"
- "vld1.8 {d4}, [r9], %1 \n"
- "vld1.8 {d5}, [r9], %1 \n"
- "vld1.8 {d6}, [r9], %1 \n"
- "vld1.8 {d7}, [r9] \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
@@ -65,152 +75,205 @@
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
- "mov r9, %2 \n"
+ "mov %0, %3 \n"
- "vst1.8 {d1}, [r9], %3 \n"
- "vst1.8 {d0}, [r9], %3 \n"
- "vst1.8 {d3}, [r9], %3 \n"
- "vst1.8 {d2}, [r9], %3 \n"
- "vst1.8 {d5}, [r9], %3 \n"
- "vst1.8 {d4}, [r9], %3 \n"
- "vst1.8 {d7}, [r9], %3 \n"
- "vst1.8 {d6}, [r9] \n"
+ MEMACCESS(0)
+ "vst1.8 {d1}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d3}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d5}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d7}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0] \n"
- "add %0, #8 \n" // src += 8
- "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
- "subs %4, #8 \n" // w -= 8
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %4, #8 \n"
+ "adds %5, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %4, #2 \n"
+ "cmp %5, #2 \n"
"blt 3f \n"
- "cmp %4, #4 \n"
+ "cmp %5, #4 \n"
"blt 2f \n"
// 4x8 block
- "mov r9, %0 \n"
- "vld1.32 {d0[0]}, [r9], %1 \n"
- "vld1.32 {d0[1]}, [r9], %1 \n"
- "vld1.32 {d1[0]}, [r9], %1 \n"
- "vld1.32 {d1[1]}, [r9], %1 \n"
- "vld1.32 {d2[0]}, [r9], %1 \n"
- "vld1.32 {d2[1]}, [r9], %1 \n"
- "vld1.32 {d3[0]}, [r9], %1 \n"
- "vld1.32 {d3[1]}, [r9] \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d3[1]}, [%0] \n"
- "mov r9, %2 \n"
+ "mov %0, %3 \n"
- "vld1.8 {q3}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {q3}, [%6] \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"
- // TODO: rework shuffle above to write
- // out with 4 instead of 8 writes
- "vst1.32 {d4[0]}, [r9], %3 \n"
- "vst1.32 {d4[1]}, [r9], %3 \n"
- "vst1.32 {d5[0]}, [r9], %3 \n"
- "vst1.32 {d5[1]}, [r9] \n"
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ MEMACCESS(0)
+ "vst1.32 {d4[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d4[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d5[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d5[1]}, [%0] \n"
- "add r9, %2, #4 \n"
- "vst1.32 {d0[0]}, [r9], %3 \n"
- "vst1.32 {d0[1]}, [r9], %3 \n"
- "vst1.32 {d1[0]}, [r9], %3 \n"
- "vst1.32 {d1[1]}, [r9] \n"
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d0[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d0[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d1[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d1[1]}, [%0] \n"
- "add %0, #4 \n" // src += 4
- "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
- "subs %4, #4 \n" // w -= 4
+ "add %1, #4 \n" // src += 4
+ "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %5, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
- "cmp %4, #2 \n"
+ "cmp %5, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
- "mov r9, %0 \n"
- "vld1.16 {d0[0]}, [r9], %1 \n"
- "vld1.16 {d1[0]}, [r9], %1 \n"
- "vld1.16 {d0[1]}, [r9], %1 \n"
- "vld1.16 {d1[1]}, [r9], %1 \n"
- "vld1.16 {d0[2]}, [r9], %1 \n"
- "vld1.16 {d1[2]}, [r9], %1 \n"
- "vld1.16 {d0[3]}, [r9], %1 \n"
- "vld1.16 {d1[3]}, [r9] \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[3]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
- "mov r9, %2 \n"
+ "mov %0, %3 \n"
- "vst1.64 {d0}, [r9], %3 \n"
- "vst1.64 {d1}, [r9] \n"
+ MEMACCESS(0)
+ "vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.64 {d1}, [%0] \n"
- "add %0, #2 \n" // src += 2
- "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
- "subs %4, #2 \n" // w -= 2
+ "add %1, #2 \n" // src += 2
+ "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %5, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
- "vld1.8 {d0[0]}, [%0], %1 \n"
- "vld1.8 {d0[1]}, [%0], %1 \n"
- "vld1.8 {d0[2]}, [%0], %1 \n"
- "vld1.8 {d0[3]}, [%0], %1 \n"
- "vld1.8 {d0[4]}, [%0], %1 \n"
- "vld1.8 {d0[5]}, [%0], %1 \n"
- "vld1.8 {d0[6]}, [%0], %1 \n"
- "vld1.8 {d0[7]}, [%0] \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[0]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[1]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[2]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[3]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[4]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[5]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[6]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[7]}, [%1] \n"
- "vst1.64 {d0}, [%2] \n"
+ MEMACCESS(3)
+ "vst1.64 {d0}, [%3] \n"
"4: \n"
- : "+r"(src), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_stride), // %3
- "+r"(width) // %4
- : "r"(&kVTbl4x4Transpose) // %5
- : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst), // %3
+ "+r"(dst_stride), // %4
+ "+r"(width) // %5
+ : "r"(&kVTbl4x4Transpose) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3"
);
}
-static const uvec8 kVTbl4x4TransposeDi =
+static uvec8 kVTbl4x4TransposeDi =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
+ const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %6, #8 \n"
+ "sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- ".p2align 4 \n"
"1: \n"
- "mov r9, %0 \n"
+ "mov %0, %1 \n"
- "vld2.8 {d0, d1}, [r9], %1 \n"
- "vld2.8 {d2, d3}, [r9], %1 \n"
- "vld2.8 {d4, d5}, [r9], %1 \n"
- "vld2.8 {d6, d7}, [r9], %1 \n"
- "vld2.8 {d16, d17}, [r9], %1 \n"
- "vld2.8 {d18, d19}, [r9], %1 \n"
- "vld2.8 {d20, d21}, [r9], %1 \n"
- "vld2.8 {d22, d23}, [r9] \n"
+ MEMACCESS(0)
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
@@ -236,59 +299,84 @@
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
- "mov r9, %2 \n"
+ "mov %0, %3 \n"
- "vst1.8 {d2}, [r9], %3 \n"
- "vst1.8 {d0}, [r9], %3 \n"
- "vst1.8 {d6}, [r9], %3 \n"
- "vst1.8 {d4}, [r9], %3 \n"
- "vst1.8 {d18}, [r9], %3 \n"
- "vst1.8 {d16}, [r9], %3 \n"
- "vst1.8 {d22}, [r9], %3 \n"
- "vst1.8 {d20}, [r9] \n"
+ MEMACCESS(0)
+ "vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d18}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d16}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d22}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d20}, [%0] \n"
- "mov r9, %4 \n"
+ "mov %0, %5 \n"
- "vst1.8 {d3}, [r9], %5 \n"
- "vst1.8 {d1}, [r9], %5 \n"
- "vst1.8 {d7}, [r9], %5 \n"
- "vst1.8 {d5}, [r9], %5 \n"
- "vst1.8 {d19}, [r9], %5 \n"
- "vst1.8 {d17}, [r9], %5 \n"
- "vst1.8 {d23}, [r9], %5 \n"
- "vst1.8 {d21}, [r9] \n"
+ MEMACCESS(0)
+ "vst1.8 {d3}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d7}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d5}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d19}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d17}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d23}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d21}, [%0] \n"
- "add %0, #8*2 \n" // src += 8*2
- "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %6, #8 \n" // w -= 8
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %7, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %6, #8 \n"
+ "adds %7, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %6, #2 \n"
+ "cmp %7, #2 \n"
"blt 3f \n"
- "cmp %6, #4 \n"
+ "cmp %7, #4 \n"
"blt 2f \n"
- //TODO(frkoenig) : clean this up
+ // TODO(frkoenig): Clean this up
// 4x8 block
- "mov r9, %0 \n"
- "vld1.64 {d0}, [r9], %1 \n"
- "vld1.64 {d1}, [r9], %1 \n"
- "vld1.64 {d2}, [r9], %1 \n"
- "vld1.64 {d3}, [r9], %1 \n"
- "vld1.64 {d4}, [r9], %1 \n"
- "vld1.64 {d5}, [r9], %1 \n"
- "vld1.64 {d6}, [r9], %1 \n"
- "vld1.64 {d7}, [r9] \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.64 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d7}, [%0] \n"
- "vld1.8 {q15}, [%7] \n"
+ MEMACCESS(8)
+ "vld1.8 {q15}, [%8] \n"
"vtrn.8 q0, q1 \n"
"vtrn.8 q2, q3 \n"
@@ -302,103 +390,142 @@
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n"
- "mov r9, %2 \n"
+ "mov %0, %3 \n"
- "vst1.32 {d16[0]}, [r9], %3 \n"
- "vst1.32 {d16[1]}, [r9], %3 \n"
- "vst1.32 {d17[0]}, [r9], %3 \n"
- "vst1.32 {d17[1]}, [r9], %3 \n"
+ MEMACCESS(0)
+ "vst1.32 {d16[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d16[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d17[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d17[1]}, [%0], %4 \n"
- "add r9, %2, #4 \n"
- "vst1.32 {d20[0]}, [r9], %3 \n"
- "vst1.32 {d20[1]}, [r9], %3 \n"
- "vst1.32 {d21[0]}, [r9], %3 \n"
- "vst1.32 {d21[1]}, [r9] \n"
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d20[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d20[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d21[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d21[1]}, [%0] \n"
- "mov r9, %4 \n"
+ "mov %0, %5 \n"
- "vst1.32 {d18[0]}, [r9], %5 \n"
- "vst1.32 {d18[1]}, [r9], %5 \n"
- "vst1.32 {d19[0]}, [r9], %5 \n"
- "vst1.32 {d19[1]}, [r9], %5 \n"
+ MEMACCESS(0)
+ "vst1.32 {d18[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d18[1]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d19[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d19[1]}, [%0], %6 \n"
- "add r9, %4, #4 \n"
- "vst1.32 {d22[0]}, [r9], %5 \n"
- "vst1.32 {d22[1]}, [r9], %5 \n"
- "vst1.32 {d23[0]}, [r9], %5 \n"
- "vst1.32 {d23[1]}, [r9] \n"
+ "add %0, %5, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d22[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d22[1]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d23[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d23[1]}, [%0] \n"
- "add %0, #4*2 \n" // src += 4 * 2
- "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
- "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
- "subs %6, #4 \n" // w -= 4
+ "add %1, #4*2 \n" // src += 4 * 2
+ "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %7, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
- "cmp %6, #2 \n"
+ "cmp %7, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
- "mov r9, %0 \n"
- "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
- "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
- "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
- "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
- "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
- "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
- "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
- "vld2.16 {d1[3], d3[3]}, [r9] \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[3], d3[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n"
- "mov r9, %2 \n"
+ "mov %0, %3 \n"
- "vst1.64 {d0}, [r9], %3 \n"
- "vst1.64 {d2}, [r9] \n"
+ MEMACCESS(0)
+ "vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.64 {d2}, [%0] \n"
- "mov r9, %4 \n"
+ "mov %0, %5 \n"
- "vst1.64 {d1}, [r9], %5 \n"
- "vst1.64 {d3}, [r9] \n"
+ MEMACCESS(0)
+ "vst1.64 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.64 {d3}, [%0] \n"
- "add %0, #2*2 \n" // src += 2 * 2
- "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
- "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
- "subs %6, #2 \n" // w -= 2
+ "add %1, #2*2 \n" // src += 2 * 2
+ "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %7, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
- "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
- "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
- "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
- "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
- "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
- "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
- "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
- "vld2.8 {d0[7], d1[7]}, [%0] \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[7], d1[7]}, [%1] \n"
- "vst1.64 {d0}, [%2] \n"
- "vst1.64 {d1}, [%4] \n"
+ MEMACCESS(3)
+ "vst1.64 {d0}, [%3] \n"
+ MEMACCESS(5)
+ "vst1.64 {d1}, [%5] \n"
"4: \n"
- : "+r"(src), // %0
- "+r"(src_stride), // %1
- "+r"(dst_a), // %2
- "+r"(dst_stride_a), // %3
- "+r"(dst_b), // %4
- "+r"(dst_stride_b), // %5
- "+r"(width) // %6
- : "r"(&kVTbl4x4TransposeDi) // %7
- : "memory", "cc", "r9",
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_a), // %3
+ "+r"(dst_stride_a), // %4
+ "+r"(dst_b), // %5
+ "+r"(dst_stride_b), // %6
+ "+r"(width) // %7
+ : "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
-#endif
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc
new file mode 100644
index 0000000..1ab448f
--- /dev/null
+++ b/files/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ const uint8* src_temp;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %3, %3, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v21.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v20.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v23.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v22.8b}, [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 8
+ "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %3, %3, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %3, %3, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %3, #4 \n"
+ "b.lt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[3], [%0] \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(4)
+ "ld1 {v2.16b}, [%4] \n"
+
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ MEMACCESS(0)
+ "st1 {v3.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[3], [%0] \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[3], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 4
+ "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %3, %3, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[3], [%0] \n"
+
+ "trn2 v2.8b, v0.8b, v1.8b \n"
+ "trn1 v3.8b, v0.8b, v1.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v3.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v2.8b}, [%0] \n"
+
+ "add %1, %1, #2 \n" // src += 2
+ "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %3, %3, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width64) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+ { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+ 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ const uint8* src_temp;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %4, %4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.16b}, [%0] \n"
+
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v20.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v20.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %4, %4, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %4, %4, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %4, #4 \n"
+ "b.lt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ MEMACCESS(8)
+ "ld1 {v30.16b}, [%8], #16 \n"
+ "ld1 {v31.16b}, [%8] \n"
+
+ "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
+ "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
+ "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
+ "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[3], [%0], %6 \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[3], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[3], [%0], %7 \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[3], [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 4 * 2
+ "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %4, %4, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[3], [%0] \n"
+
+ "trn1 v4.8b, v0.8b, v2.8b \n"
+ "trn2 v5.8b, v0.8b, v2.8b \n"
+ "trn1 v6.8b, v1.8b, v3.8b \n"
+ "trn2 v7.8b, v1.8b, v3.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v4.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v6.d}[0], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v5.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v7.d}[0], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 2 * 2
+ "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %4, %4, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.d}[0], [%2] \n"
+ MEMACCESS(3)
+ "st1 {v1.d}[0], [%3] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width64) // %4
+ : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
+ "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v30", "v31"
+ );
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc
new file mode 100644
index 0000000..1300fc0
--- /dev/null
+++ b/files/source/rotate_win.cc
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm {
+ push edi
+ push esi
+ push ebp
+ mov eax, [esp + 12 + 4] // src
+ mov edi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst
+ mov esi, [esp + 12 + 16] // dst_stride
+ mov ecx, [esp + 12 + 20] // width
+
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea ebp, [eax + 8]
+ movq xmm1, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm0, xmm1
+ movq xmm2, qword ptr [eax]
+ movdqa xmm1, xmm0
+ palignr xmm1, xmm1, 8
+ movq xmm3, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ movq xmm4, qword ptr [eax]
+ palignr xmm3, xmm3, 8
+ movq xmm5, qword ptr [eax + edi]
+ punpcklbw xmm4, xmm5
+ lea eax, [eax + 2 * edi]
+ movdqa xmm5, xmm4
+ movq xmm6, qword ptr [eax]
+ palignr xmm5, xmm5, 8
+ movq xmm7, qword ptr [eax + edi]
+ punpcklbw xmm6, xmm7
+ mov eax, ebp
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm7, 8
+ // Second round of bit swap.
+ punpcklwd xmm0, xmm2
+ punpcklwd xmm1, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ palignr xmm2, xmm2, 8
+ palignr xmm3, xmm3, 8
+ punpcklwd xmm4, xmm6
+ punpcklwd xmm5, xmm7
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ palignr xmm6, xmm6, 8
+ palignr xmm7, xmm7, 8
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ punpckldq xmm0, xmm4
+ movq qword ptr [edx], xmm0
+ movdqa xmm4, xmm0
+ palignr xmm4, xmm4, 8
+ movq qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ punpckldq xmm2, xmm6
+ movdqa xmm6, xmm2
+ palignr xmm6, xmm6, 8
+ movq qword ptr [edx], xmm2
+ punpckldq xmm1, xmm5
+ movq qword ptr [edx + esi], xmm6
+ lea edx, [edx + 2 * esi]
+ movdqa xmm5, xmm1
+ movq qword ptr [edx], xmm1
+ palignr xmm5, xmm5, 8
+ punpckldq xmm3, xmm7
+ movq qword ptr [edx + esi], xmm5
+ lea edx, [edx + 2 * esi]
+ movq qword ptr [edx], xmm3
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm7, 8
+ sub ecx, 8
+ movq qword ptr [edx + esi], xmm7
+ lea edx, [edx + 2 * esi]
+ jg convertloop
+
+ pop ebp
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] // src
+ mov edi, [esp + 16 + 8] // src_stride
+ mov edx, [esp + 16 + 12] // dst_a
+ mov esi, [esp + 16 + 16] // dst_stride_a
+ mov ebx, [esp + 16 + 20] // dst_b
+ mov ebp, [esp + 16 + 24] // dst_stride_b
+ mov ecx, esp
+ sub esp, 4 + 16
+ and esp, ~15
+ mov [esp + 16], ecx
+ mov ecx, [ecx + 16 + 28] // w
+
+ align 4
+ convertloop:
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm0 // use xmm7 as temp register.
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm7, xmm1
+ movdqa xmm1, xmm7
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm2
+ punpcklbw xmm2, xmm3
+ punpckhbw xmm7, xmm3
+ movdqa xmm3, xmm7
+ movdqu xmm4, [eax]
+ movdqu xmm5, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm4
+ punpcklbw xmm4, xmm5
+ punpckhbw xmm7, xmm5
+ movdqa xmm5, xmm7
+ movdqu xmm6, [eax]
+ movdqu xmm7, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqu [esp], xmm5 // backup xmm5
+ neg edi
+ movdqa xmm5, xmm6 // use xmm5 as temp register.
+ punpcklbw xmm6, xmm7
+ punpckhbw xmm5, xmm7
+ movdqa xmm7, xmm5
+ lea eax, [eax + 8 * edi + 16]
+ neg edi
+ // Second round of bit swap.
+ movdqa xmm5, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm5, xmm2
+ movdqa xmm2, xmm5
+ movdqa xmm5, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm5, xmm3
+ movdqa xmm3, xmm5
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm5, xmm6
+ movdqa xmm6, xmm5
+ movdqu xmm5, [esp] // restore xmm5
+ movdqu [esp], xmm6 // backup xmm6
+ movdqa xmm6, xmm5 // use xmm6 as temp register.
+ punpcklwd xmm5, xmm7
+ punpckhwd xmm6, xmm7
+ movdqa xmm7, xmm6
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ movdqa xmm6, xmm0
+ punpckldq xmm0, xmm4
+ punpckhdq xmm6, xmm4
+ movdqa xmm4, xmm6
+ movdqu xmm6, [esp] // restore xmm6
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [ebx], xmm0
+ movlpd qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm4
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm2 // use xmm0 as the temp register.
+ punpckldq xmm2, xmm6
+ movlpd qword ptr [edx], xmm2
+ movhpd qword ptr [ebx], xmm2
+ punpckhdq xmm0, xmm6
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm1 // use xmm0 as the temp register.
+ punpckldq xmm1, xmm5
+ movlpd qword ptr [edx], xmm1
+ movhpd qword ptr [ebx], xmm1
+ punpckhdq xmm0, xmm5
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ punpckldq xmm3, xmm7
+ movlpd qword ptr [edx], xmm3
+ movhpd qword ptr [ebx], xmm3
+ punpckhdq xmm0, xmm7
+ sub ecx, 8
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ jg convertloop
+
+ mov esp, [esp + 16]
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
new file mode 100644
index 0000000..494164f
--- /dev/null
+++ b/files/source/row_any.cc
@@ -0,0 +1,824 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsampled source needs to be increase by 1 of not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 4 planes to 1 with yuvconstants
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ const uint8* a_buf, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 5]); \
+ memset(temp, 0, 64 * 4); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 192, a_buf + n, r); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
+ yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#undef ANY41C
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+#ifdef HAS_I422TOYUY2ROW_SSE2
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ if (width & 1) { \
+ temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
+ temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
+ yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I411TOARGBROW_SSSE3
+ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+#endif // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I411TOARGBROW_AVX2
+ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#undef ANY31C
+
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
+ uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#undef ANY21
+
+// Any 2 planes to 1 with yuvconstants
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
+ uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#undef ANY21C
+
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
+#endif
+#undef ANY11
+
+// Any 1 to 1 blended. Destination is read, modify, write.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(temp, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
+// Any 1 to 1 with parameter.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
+ T shuffler, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
+ const uint32, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
+ const uint32, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
+ const uint32, 4, 2, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 with yuvconstants
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
+ ptrdiff_t src_stride_ptr, int width, \
+ int source_y_fraction) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_DSPR2
+ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
+#endif
+#undef ANY11T
+
+// Any 1 to 1 mirror.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr, r * BPP); \
+ ANY_SIMD(temp, temp + 64, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+ }
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8 temp[64]); \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(temp, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp, r * BPP); \
+ }
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2. Outputs UV planes.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
+ SIMD_ALIGNED(uint8 temp[128 * 3]); \
+ memset(temp, 0, 128); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ /* repeat last 4 bytes for 422 subsampler */ \
+ if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ /* repeat last 4 - 12 bytes for 411 subsampler */ \
+ if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \
+ } \
+ if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \
+ } \
+ if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
+ }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_DSPR2
+ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#undef ANY12
+
+// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 4]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\
+ memcpy(temp + SS(r, UVSHIFT) * BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
+ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
+ memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
+ }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#undef ANY12S
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
index c5f3ce0..32d2f68 100644
--- a/files/source/row_common.cc
+++ b/files/source/row_common.cc
@@ -4,13 +4,13 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
-#include <string.h> // For memcpy
+#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
@@ -19,56 +19,60 @@
extern "C" {
#endif
-void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
- // To support in-place conversion.
- uint8 a = src_bgra[0];
- uint8 r = src_bgra[1];
- uint8 g = src_bgra[2];
- uint8 b = src_bgra[3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- dst_argb += 4;
- src_bgra += 4;
- }
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return ((-(v) >> 31) & (v));
}
-void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
- // To support in-place conversion.
- uint8 r = src_abgr[0];
- uint8 g = src_abgr[1];
- uint8 b = src_abgr[2];
- uint8 a = src_abgr[3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- dst_argb += 4;
- src_abgr += 4;
- }
+static __inline int32 clamp255(int32 v) {
+ return (((255 - (v)) >> 31) | (v)) & 255;
}
-void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
- // To support in-place conversion.
- uint8 a = src_abgr[0];
- uint8 b = src_abgr[1];
- uint8 g = src_abgr[2];
- uint8 r = src_abgr[3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- dst_argb += 4;
- src_abgr += 4;
- }
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
}
+static __inline uint32 Abs(int32 v) {
+ int m = v >> 31;
+ return (v + m) ^ m;
+}
+#else // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+ return (v > 255) ? 255 : v;
+}
+
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+ return (v < 0) ? -v : v;
+}
+#endif // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+ p[0] = (uint8)(v & 255);
+ p[1] = (uint8)((v >> 8) & 255);
+ p[2] = (uint8)((v >> 16) & 255);
+ p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
uint8 b = src_rgb24[0];
uint8 g = src_rgb24[1];
uint8 r = src_rgb24[2];
@@ -82,7 +86,8 @@
}
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
@@ -95,67 +100,72 @@
}
}
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
- uint8 b = src_rgb[0] & 0x1f;
- uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
- uint8 r = src_rgb[1] >> 3;
+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ dst_rgb24 += 3;
+ src_raw += 3;
+ }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
dst_argb[0] = (b << 3) | (b >> 2);
dst_argb[1] = (g << 2) | (g >> 4);
dst_argb[2] = (r << 3) | (r >> 2);
dst_argb[3] = 255u;
dst_argb += 4;
- src_rgb += 2;
+ src_rgb565 += 2;
}
}
-void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
- uint8 b = src_rgb[0] & 0x1f;
- uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
- uint8 r = (src_rgb[1] & 0x7c) >> 2;
- uint8 a = src_rgb[1] >> 7;
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 a = src_argb1555[1] >> 7;
dst_argb[0] = (b << 3) | (b >> 2);
dst_argb[1] = (g << 3) | (g >> 2);
dst_argb[2] = (r << 3) | (r >> 2);
dst_argb[3] = -a;
dst_argb += 4;
- src_rgb += 2;
+ src_argb1555 += 2;
}
}
-void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
- uint8 b = src_rgb[0] & 0x0f;
- uint8 g = src_rgb[0] >> 4;
- uint8 r = src_rgb[1] & 0x0f;
- uint8 a = src_rgb[1] >> 4;
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ uint8 a = src_argb4444[1] >> 4;
dst_argb[0] = (b << 4) | b;
dst_argb[1] = (g << 4) | g;
dst_argb[2] = (r << 4) | r;
dst_argb[3] = (a << 4) | a;
dst_argb += 4;
- src_rgb += 2;
- }
-}
-
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- for (int x = 0; x < width; ++x) {
- uint8 b = src_argb[0];
- uint8 g = src_argb[1];
- uint8 r = src_argb[2];
- uint8 a = src_argb[3];
- dst_rgb[0] = a;
- dst_rgb[1] = b;
- dst_rgb[2] = g;
- dst_rgb[3] = r;
- dst_rgb += 4;
- src_argb += 4;
+ src_argb4444 += 2;
}
}
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -168,7 +178,8 @@
}
void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -180,17 +191,17 @@
}
}
-// TODO(fbarchard): support big endian CPU
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- for (int x = 0; x < width - 1; x += 2) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
uint8 b0 = src_argb[0] >> 3;
uint8 g0 = src_argb[1] >> 2;
uint8 r0 = src_argb[2] >> 3;
uint8 b1 = src_argb[4] >> 3;
uint8 g1 = src_argb[5] >> 2;
uint8 r1 = src_argb[6] >> 3;
- *reinterpret_cast<uint32*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27);
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
@@ -198,12 +209,47 @@
uint8 b0 = src_argb[0] >> 3;
uint8 g0 = src_argb[1] >> 2;
uint8 r0 = src_argb[2] >> 3;
- *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+// dither4 is a row of 4 values from 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The order in the 4x4 matrix in first byte is upper left.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect order of the original matrix. But the dither4
+// will containing the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ int dither0 = ((const unsigned char*)(&dither4))[x & 3];
+ int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
+ uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+ uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
+ uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
+ uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
+ uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
}
}
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- for (int x = 0; x < width - 1; x += 2) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
uint8 b0 = src_argb[0] >> 3;
uint8 g0 = src_argb[1] >> 3;
uint8 r0 = src_argb[2] >> 3;
@@ -212,7 +258,7 @@
uint8 g1 = src_argb[5] >> 3;
uint8 r1 = src_argb[6] >> 3;
uint8 a1 = src_argb[7] >> 7;
- *reinterpret_cast<uint32*>(dst_rgb) =
+ *(uint32*)(dst_rgb) =
b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
(b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
dst_rgb += 4;
@@ -223,13 +269,14 @@
uint8 g0 = src_argb[1] >> 3;
uint8 r0 = src_argb[2] >> 3;
uint8 a0 = src_argb[3] >> 7;
- *reinterpret_cast<uint16*>(dst_rgb) =
+ *(uint16*)(dst_rgb) =
b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
}
}
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
- for (int x = 0; x < width - 1; x += 2) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
uint8 b0 = src_argb[0] >> 4;
uint8 g0 = src_argb[1] >> 4;
uint8 r0 = src_argb[2] >> 4;
@@ -238,7 +285,7 @@
uint8 g1 = src_argb[5] >> 4;
uint8 r1 = src_argb[6] >> 4;
uint8 a1 = src_argb[7] >> 4;
- *reinterpret_cast<uint32*>(dst_rgb) =
+ *(uint32*)(dst_rgb) =
b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
(b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
dst_rgb += 4;
@@ -249,44 +296,46 @@
uint8 g0 = src_argb[1] >> 4;
uint8 r0 = src_argb[2] >> 4;
uint8 a0 = src_argb[3] >> 4;
- *reinterpret_cast<uint16*>(dst_rgb) =
+ *(uint16*)(dst_rgb) =
b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
}
}
static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
- return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
+ return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
- return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
+ return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
- return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
+ return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
-#define MAKEROWY(NAME, R, G, B) \
+#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- for (int x = 0; x < width; ++x) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += 4; \
+ src_argb0 += BPP; \
dst_y += 1; \
} \
} \
void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
uint8* dst_u, uint8* dst_v, int width) { \
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- for (int x = 0; x < width - 1; x += 2) { \
- uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
- src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
- uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
- src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
- uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
- src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \
+ src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \
+ src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \
+ src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += 8; \
- src_rgb1 += 8; \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
@@ -299,21 +348,333 @@
} \
}
-MAKEROWY(ARGB, 2, 1, 0)
-MAKEROWY(BGRA, 1, 2, 3)
-MAKEROWY(ABGR, 0, 1, 2)
-MAKEROWY(RGBA, 3, 2, 1)
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
-// http://en.wikipedia.org/wiki/Grayscale.
-// 0.11 * B + 0.59 * G + 0.30 * R
-// Coefficients rounded to multiple of 2 for consistency with SSSE3 version.
-static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) {
- return (( 76 * r + 152 * g + 28 * b) >> 8);
+// JPeg uses a variation on BT.601-1 full range
+// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
+// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b 0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r 0.50000 * 255 = 127.5 = 127
+
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+ return (38 * r + 75 * g + 15 * b + 64) >> 7;
+}
+
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
+}
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
+}
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
+ b = (b << 3) | (b >> 2);
+ g = (g << 2) | (g >> 4);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_rgb565 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ b = (b << 3) | (b >> 2);
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb1555 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ b = (b << 4) | b;
+ g = (g << 4) | g;
+ r = (r << 4) | r;
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb4444 += 2;
+ dst_y += 1;
+ }
+}
+
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b1 = src_rgb565[2] & 0x1f;
+ uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+ uint8 r1 = src_rgb565[3] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b3 = next_rgb565[2] & 0x1f;
+ uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+ uint8 r3 = next_rgb565[3] >> 3;
+ uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 787 -> 888.
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_rgb565 += 4;
+ next_rgb565 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b = (b0 + b2); // 565 * 2 = 676.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 676 -> 888
+ g = (g << 1) | (g >> 6);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b1 = src_argb1555[2] & 0x1f;
+ uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+ uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+ uint8 b3 = next_argb1555[2] & 0x1f;
+ uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+ uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+ uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 777 -> 888.
+ g = (g << 1) | (g >> 6);
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb1555 += 4;
+ next_argb1555 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8 r2 = next_argb1555[1] >> 3;
+ uint8 b = (b0 + b2); // 555 * 2 = 666.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b1 = src_argb4444[2] & 0x0f;
+ uint8 g1 = src_argb4444[2] >> 4;
+ uint8 r1 = src_argb4444[3] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b3 = next_argb4444[2] & 0x0f;
+ uint8 g3 = next_argb4444[2] >> 4;
+ uint8 r3 = next_argb4444[3] & 0x0f;
+ uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb4444 += 4;
+ next_argb4444 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b = (b0 + b2); // 444 * 2 = 555.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 3) | (b >> 2); // 555 -> 888.
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+void ARGBToUV411Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 3; x += 4) {
+ uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+ uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+ uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 16;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ // Odd width handling mimics 'any' function which replicates last pixel.
+ if ((width & 3) == 3) {
+ uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
+ uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
+ uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ } else if ((width & 3) == 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ } else if ((width & 3) == 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ }
}
void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
- uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]);
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = src_argb[3];
dst_argb += 4;
@@ -323,7 +684,8 @@
// Convert a row of image to Sepia tone.
void ARGBSepiaRow_C(uint8* dst_argb, int width) {
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
int b = dst_argb[0];
int g = dst_argb[1];
int r = dst_argb[2];
@@ -331,60 +693,44 @@
int sg = (b * 22 + g * 88 + r * 45) >> 7;
int sr = (b * 24 + g * 98 + r * 50) >> 7;
// b does not over flow. a is preserved from original.
- if (sg > 255) {
- sg = 255;
- }
- if (sr > 255) {
- sr = 255;
- }
dst_argb[0] = sb;
- dst_argb[1] = sg;
- dst_argb[2] = sr;
+ dst_argb[1] = clamp255(sg);
+ dst_argb[2] = clamp255(sr);
dst_argb += 4;
}
}
// Apply color matrix to a row of image. Matrix is signed.
-void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
- for (int x = 0; x < width; ++x) {
- int b = dst_argb[0];
- int g = dst_argb[1];
- int r = dst_argb[2];
- int a = dst_argb[3];
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = src_argb[0];
+ int g = src_argb[1];
+ int r = src_argb[2];
+ int a = src_argb[3];
int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
- r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+ r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
- r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+ r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
- r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
- if (sb < 0) {
- sb = 0;
- }
- if (sb > 255) {
- sb = 255;
- }
- if (sg < 0) {
- sg = 0;
- }
- if (sg > 255) {
- sg = 255;
- }
- if (sr < 0) {
- sr = 0;
- }
- if (sr > 255) {
- sr = 255;
- }
- dst_argb[0] = sb;
- dst_argb[1] = sg;
- dst_argb[2] = sr;
+ r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
+ int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
+ r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+ dst_argb[0] = Clamp(sb);
+ dst_argb[1] = Clamp(sg);
+ dst_argb[2] = Clamp(sr);
+ dst_argb[3] = Clamp(sa);
+ src_argb += 4;
dst_argb += 4;
}
}
// Apply color table to a row of image.
void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
int b = dst_argb[0];
int g = dst_argb[1];
int r = dst_argb[2];
@@ -397,9 +743,24 @@
}
}
+// Apply color table to a row of image.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb += 4;
+ }
+}
+
void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
int b = dst_argb[0];
int g = dst_argb[1];
int r = dst_argb[2];
@@ -410,9 +771,192 @@
}
}
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ const uint32 b_scale = REPEAT8(value & 0xff);
+ const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+ const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+ const uint32 a_scale = REPEAT8(value >> 24);
+
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb[0]);
+ const uint32 g = REPEAT8(src_argb[1]);
+ const uint32 r = REPEAT8(src_argb[2]);
+ const uint32 a = REPEAT8(src_argb[3]);
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 16
+
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb0[0]);
+ const uint32 g = REPEAT8(src_argb0[1]);
+ const uint32 r = REPEAT8(src_argb0[2]);
+ const uint32 a = REPEAT8(src_argb0[3]);
+ const uint32 b_scale = src_argb1[0];
+ const uint32 g_scale = src_argb1[1];
+ const uint32 r_scale = src_argb1[2];
+ const uint32 a_scale = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define SHADE(f, v) clamp255(v + f)
+
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_add = src_argb1[0];
+ const int g_add = src_argb1[1];
+ const int r_add = src_argb1[2];
+ const int a_add = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_add);
+ dst_argb[1] = SHADE(g, g_add);
+ dst_argb[2] = SHADE(r, r_add);
+ dst_argb[3] = SHADE(a, a_add);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+#define SHADE(f, v) clamp0(f - v)
+
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_sub = src_argb1[0];
+ const int g_sub = src_argb1[1];
+ const int r_sub = src_argb1[2];
+ const int a_sub = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_sub);
+ dst_argb[1] = SHADE(g, g_sub);
+ dst_argb[2] = SHADE(r, r_sub);
+ dst_argb[3] = SHADE(a, a_sub);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+// Sobel functions which mimics SSSE3.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+ uint8* dst_sobelx, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int a = src_y0[i];
+ int b = src_y1[i];
+ int c = src_y2[i];
+ int a_sub = src_y0[i + 2];
+ int b_sub = src_y1[i + 2];
+ int c_sub = src_y2[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobelx[i] = (uint8)(clamp255(sobel));
+ }
+}
+
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int a = src_y0[i + 0];
+ int b = src_y0[i + 1];
+ int c = src_y0[i + 2];
+ int a_sub = src_y1[i + 0];
+ int b_sub = src_y1[i + 1];
+ int c_sub = src_y1[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobely[i] = (uint8)(clamp255(sobel));
+ }
+}
+
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_argb[0] = (uint8)(s);
+ dst_argb[1] = (uint8)(s);
+ dst_argb[2] = (uint8)(s);
+ dst_argb[3] = (uint8)(255u);
+ dst_argb += 4;
+ }
+}
+
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_y[i] = (uint8)(s);
+ }
+}
+
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int g = clamp255(r + b);
+ dst_argb[0] = (uint8)(b);
+ dst_argb[1] = (uint8)(g);
+ dst_argb[2] = (uint8)(r);
+ dst_argb[3] = (uint8)(255u);
+ dst_argb += 4;
+ }
+}
+
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB.
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
@@ -421,250 +965,786 @@
}
}
+// TODO(fbarchard): Unify these structures to be platform independent.
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+ { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+ { UG, VG, UG, VG, UG, VG, UG, VG },
+ { UG, VG, UG, VG, UG, VG, UG, VG },
+ { BB, BG, BR, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+ { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG },
+ { VG, UG, VG, UG, VG, UG, VG, UG },
+ { BR, BG, BB, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { BB, BG, BR, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { BR, BG, BB, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+#else
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -113 /* round(-1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+ { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+ { UG, VG, UG, VG, UG, VG, UG, VG },
+ { UG, VG, UG, VG, UG, VG, UG, VG },
+ { BB, BG, BR, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+ { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG },
+ { VG, UG, VG, UG, VG, UG, VG, UG },
+ { BR, BG, BB, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { BB, BG, BR, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { BR, BG, BB, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+#else
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.709 YUV to RGB reference
+// * R = Y - V * -1.28033
+// * G = Y - U * 0.21482 - V * 0.38059
+// * B = Y - U * -2.12798
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32 /* 64 / 2 */
+
+// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.12798 * 64)) */
+#define UG 14 /* round(0.21482 * 64) */
+#define VG 24 /* round(0.38059 * 64) */
+#define VR -82 /* round(-1.28033 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+ { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+ { UG, VG, UG, VG, UG, VG, UG, VG },
+ { UG, VG, UG, VG, UG, VG, UG, VG },
+ { BB, BG, BR, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+ { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG },
+ { VG, UG, VG, UG, VG, UG, VG, UG },
+ { BR, BG, BB, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { BB, BG, BR, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { BR, BG, BB, 0, 0, 0, 0, 0 },
+ { 0x0101 * YG, 0, 0, 0 }
+};
+#else
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
// C reference code that mimics the YUV assembly.
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = -yuvconstants->kUVToRB[1];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[4];
+ int vr = -yuvconstants->kUVToRB[4];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+ int ub = yuvconstants->kUVToB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = yuvconstants->kUVToR[1];
+ int bb = yuvconstants->kUVBiasB[0];
+ int bg = yuvconstants->kUVBiasG[0];
+ int br = yuvconstants->kUVBiasR[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
-#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
-#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
-#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-static __inline uint32 Clip(int32 val) {
- if (val < 0) {
- return static_cast<uint32>(0);
- } else if (val > 255) {
- return static_cast<uint32>(255);
- }
- return static_cast<uint32>(val);
+ uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
+ *b = Clamp((int32)(-(u * ub ) + y1 + bb) >> 6);
+ *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
+ *r = Clamp((int32)(-( v * vr) + y1 + br) >> 6);
}
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
- int ashift, int rshift, int gshift, int bshift) {
- int32 y1 = (static_cast<int32>(y) - 16) * YG;
- uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
- uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
- uint32 r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
- *reinterpret_cast<uint32*>(rgb_buf) = (b << bshift) |
- (g << gshift) |
- (r << rshift) |
- (255u << ashift);
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(y1 + YGB) >> 6);
+ *g = Clamp((int32)(y1 + YGB) >> 6);
+ *r = Clamp((int32)(y1 + YGB) >> 6);
}
-static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
- uint8* b, uint8* g, uint8* r) {
- int32 y1 = (static_cast<int32>(y) - 16) * YG;
- *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
- *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
- *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
-}
+#undef YG
+#undef YGB
-void I444ToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
+// C mimic assembly.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
int width) {
- for (int x = 0; x < width; ++x) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
- y_buf += 1;
- u_buf += 1;
- v_buf += 1;
- rgb_buf += 4; // Advance 1 pixel.
- }
-}
-
-// Also used for 420
-void I422ToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
- YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
- y_buf += 2;
- u_buf += 1;
- v_buf += 1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+ uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+ YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
+ yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
+ yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 2;
+ src_v += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
}
}
-
-void I422ToRGB24Row_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
- rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
- y_buf += 2;
- u_buf += 1;
- v_buf += 1;
- rgb_buf += 6; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- }
-}
-
-void I422ToRAWRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
- rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
- YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
- rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
- y_buf += 2;
- u_buf += 1;
- v_buf += 1;
- rgb_buf += 6; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
- }
-}
-
-void I411ToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
+#else
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
int width) {
- for (int x = 0; x < width - 3; x += 4) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
- YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
- YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
- YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
- y_buf += 4;
- u_buf += 1;
- v_buf += 1;
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+#endif
+
+// Also used for 420
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422AlphaToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = src_a[0];
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = src_a[1];
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ src_a += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = src_a[0];
+ }
+}
+
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ }
+}
+
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ b1 = b1 >> 4;
+ g1 = g1 >> 4;
+ r1 = r1 >> 4;
+ *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb4444 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+ 0xf000;
+ }
+}
+
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 3;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb1555 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+ 0x8000;
+ }
+}
+
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void I411ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 3; x += 4) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ YuvPixel(src_y[2], src_u[0], src_v[0],
+ rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
+ rgb_buf[11] = 255;
+ YuvPixel(src_y[3], src_u[0], src_v[0],
+ rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
+ rgb_buf[15] = 255;
+ src_y += 4;
+ src_u += 1;
+ src_v += 1;
rgb_buf += 16; // Advance 4 pixels.
}
if (width & 2) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
- YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
- y_buf += 2;
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
}
}
-void NV12ToARGBRow_C(const uint8* y_buf,
- const uint8* uv_buf,
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
- YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
- y_buf += 2;
- uv_buf += 2;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_uv[0], src_uv[1],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_uv += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
}
}
-void NV21ToARGBRow_C(const uint8* y_buf,
- const uint8* vu_buf,
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_vu,
uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
- YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
- y_buf += 2;
- vu_buf += 2;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_vu[1], src_vu[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_vu += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(src_y[0], src_vu[1], src_vu[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
}
}
-void I422ToBGRARow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ src_uv += 2;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
- YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
- y_buf += 2;
- u_buf += 1;
- v_buf += 1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_yuy2 += 4;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
}
}
-void I422ToABGRRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
+void UYVYToARGBRow_C(const uint8* src_uyvy,
uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
- YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
- y_buf += 2;
- u_buf += 1;
- v_buf += 1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_uyvy += 4;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
}
}
-void I422ToRGBARow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
int width) {
- for (int x = 0; x < width - 1; x += 2) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
- YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 24, 16, 8);
- y_buf += 2;
- u_buf += 1;
- v_buf += 1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+ rgb_buf[0] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
+ rgb_buf[4] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+ rgb_buf[0] = 255;
}
}
-void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
- for (int x = 0; x < width; ++x) {
- YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
- y_buf += 1;
- rgb_buf += 4; // Advance 1 pixel.
+void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
}
}
void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+ int x;
src += width - 1;
- for (int x = 0; x < width - 1; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst[x] = src[0];
dst[x + 1] = src[-1];
src -= 2;
@@ -674,9 +1754,10 @@
}
}
-void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ int x;
src_uv += (width - 1) << 1;
- for (int x = 0; x < width - 1; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[-2];
dst_v[x] = src_uv[1];
@@ -690,10 +1771,11 @@
}
void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
- const uint32* src32 = reinterpret_cast<const uint32*>(src);
- uint32* dst32 = reinterpret_cast<uint32*>(dst);
+ int x;
+ const uint32* src32 = (const uint32*)(src);
+ uint32* dst32 = (uint32*)(dst);
src32 += width - 1;
- for (int x = 0; x < width - 1; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst32[x] = src32[0];
dst32[x + 1] = src32[-1];
src32 -= 2;
@@ -703,8 +1785,9 @@
}
}
-void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
- for (int x = 0; x < width - 1; x += 2) {
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[2];
dst_v[x] = src_uv[1];
@@ -717,29 +1800,39 @@
}
}
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = src_u[x];
+ dst_uv[1] = src_v[x];
+ dst_uv[2] = src_u[x + 1];
+ dst_uv[3] = src_v[x + 1];
+ dst_uv += 4;
+ }
+ if (width & 1) {
+ dst_uv[0] = src_u[width - 1];
+ dst_uv[1] = src_v[width - 1];
+ }
+}
+
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
-void SetRow8_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
- // VC will generate rep stosb.
- for (int x = 0; x < count; ++x) {
- dst[x] = v8;
- }
-#else
- memset(dst, v8, count);
-#endif
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+ memcpy(dst, src, count * 2);
}
-void SetRows32_C(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- uint32* d = reinterpret_cast<uint32*>(dst);
- for (int x = 0; x < width; ++x) {
- d[x] = v32;
- }
- dst += dst_stride;
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+ memset(dst, v8, width);
+}
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+ uint32* d = (uint32*)(dst_argb);
+ int x;
+ for (x = 0; x < width; ++x) {
+ d[x] = v32;
}
}
@@ -747,7 +1840,8 @@
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of YUY2.
- for (int x = 0; x < width; x += 2) {
+ int x;
+ for (x = 0; x < width; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
@@ -760,7 +1854,8 @@
void YUY2ToUV422Row_C(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values.
- for (int x = 0; x < width; x += 2) {
+ int x;
+ for (x = 0; x < width; x += 2) {
dst_u[0] = src_yuy2[1];
dst_v[0] = src_yuy2[3];
src_yuy2 += 4;
@@ -772,7 +1867,8 @@
// Copy row of YUY2 Y's (422) into Y (420/422).
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Output a row of Y values.
- for (int x = 0; x < width - 1; x += 2) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[0];
dst_y[x + 1] = src_yuy2[2];
src_yuy2 += 4;
@@ -786,7 +1882,8 @@
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values.
- for (int x = 0; x < width; x += 2) {
+ int x;
+ for (x = 0; x < width; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
@@ -799,7 +1896,8 @@
void UYVYToUV422Row_C(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values.
- for (int x = 0; x < width; x += 2) {
+ int x;
+ for (x = 0; x < width; x += 2) {
dst_u[0] = src_uyvy[0];
dst_v[0] = src_uyvy[2];
src_uyvy += 4;
@@ -811,7 +1909,8 @@
// Copy row of UYVY Y's (422) into Y (420/422).
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
// Output a row of Y values.
- for (int x = 0; x < width - 1; x += 2) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
dst_y[x] = src_uyvy[1];
dst_y[x + 1] = src_uyvy[3];
src_uyvy += 4;
@@ -828,7 +1927,8 @@
// This code mimics the SSSE3 version for better testability.
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
- for (int x = 0; x < width - 1; x += 2) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
uint32 fb = src_argb0[0];
uint32 fg = src_argb0[1];
uint32 fr = src_argb0[2];
@@ -872,12 +1972,32 @@
}
}
#undef BLEND
+
+#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+ dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
+ src0 += 2;
+ src1 += 2;
+ alpha += 2;
+ dst += 2;
+ }
+ if (width & 1) {
+ dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+ }
+}
+#undef UBLEND
+
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
// Multiply source RGB by alpha and store to destination.
// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
- for (int i = 0; i < width - 1; i += 2) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
uint32 b = src_argb[0];
uint32 g = src_argb[1];
uint32 r = src_argb[2];
@@ -916,10 +2036,10 @@
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
-// 8.16 fixed point inverse table
-#define T(a) 0x10000 / a
-uint32 fixed_invtbl8[256] = {
- 0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
+const uint32 fixed_invtbl8[256] = {
+ 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
@@ -950,202 +2070,35 @@
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
- T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 };
+ T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
#undef T
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
- for (int i = 0; i < width; ++i) {
+ int i;
+ for (i = 0; i < width; ++i) {
uint32 b = src_argb[0];
uint32 g = src_argb[1];
uint32 r = src_argb[2];
const uint32 a = src_argb[3];
- if (a) {
- const uint32 ia = fixed_invtbl8[a]; // 8.16 fixed point
- b = (b * ia) >> 8;
- g = (g * ia) >> 8;
- r = (r * ia) >> 8;
- // Clamping should not be necessary but is free in assembly.
- if (b > 255) {
- b = 255;
- }
- if (g > 255) {
- g = 255;
- }
- if (r > 255) {
- r = 255;
- }
- }
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
+ const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
+ b = (b * ia) >> 8;
+ g = (g * ia) >> 8;
+ r = (r * ia) >> 8;
+ // Clamping should not be necessary but is free in assembly.
+ dst_argb[0] = clamp255(b);
+ dst_argb[1] = clamp255(g);
+ dst_argb[2] = clamp255(r);
dst_argb[3] = a;
src_argb += 4;
dst_argb += 4;
}
}
-// Wrappers to handle odd width
-#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* u_buf, \
- const uint8* v_buf, \
- uint8* rgb_buf, \
- int width) { \
- int n = width & ~7; \
- I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \
- I420TORGB_C(y_buf + n, \
- u_buf + (n >> UV_SHIFT), \
- v_buf + (n >> UV_SHIFT), \
- rgb_buf + n * 4, width & 7); \
- }
-
-// Wrappers to handle odd width
-#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* uv_buf, \
- uint8* rgb_buf, \
- int width) { \
- int n = width & ~7; \
- NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \
- NV12TORGB_C(y_buf + n, \
- uv_buf + (n >> UV_SHIFT), \
- rgb_buf + n * 4, width & 7); \
- }
-
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
-Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
-Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
-#endif
-#ifdef HAS_I422TORGB24ROW_SSSE3
-YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
- I422ToRGB24Row_C, 1)
-YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
-#endif
-#ifdef HAS_I422TORGBAROW_SSSE3
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
-#endif
-#ifdef HAS_I422TOARGBROW_NEON
-YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
-YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
-YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
-YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
-Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
-Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
-YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
-YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
-#endif
-#undef YANY
-
-#define RGBANY(NAMEANY, ARGBTORGB, BPP) \
- void NAMEANY(const uint8* argb_buf, \
- uint8* rgb_buf, \
- int width) { \
- SIMD_ALIGNED(uint8 row[kMaxStride]); \
- ARGBTORGB(argb_buf, row, width); \
- memcpy(rgb_buf, row, width * BPP); \
- }
-
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
-RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
-RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
-RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
-RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3)
-RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3)
-#endif
-#undef RGBANY
-
-#define YANY(NAMEANY, ARGBTOY_SSE, BPP) \
- void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
- ARGBTOY_SSE(src_argb, dst_y, width - 16); \
- ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \
- }
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
-#endif
-#ifdef HAS_RGBATOYROW_SSSE3
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
-#endif
-#ifdef HAS_YUY2TOYROW_SSE2
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
-#endif
-#ifdef HAS_YUY2TOYROW_NEON
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
-#endif
-#undef YANY
-
-#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
- void NAMEANY(const uint8* src_argb, int src_stride_argb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- int n = width & ~15; \
- ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
- ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
- dst_u + (n >> 1), \
- dst_v + (n >> 1), \
- width & 15); \
- }
-
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
-#endif
-#ifdef HAS_RGBATOYROW_SSSE3
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
-#endif
-#ifdef HAS_YUY2TOUVROW_SSE2
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
-#endif
-#ifdef HAS_YUY2TOUVROW_NEON
-UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
-#endif
-#undef UVANY
-
-#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
- void NAMEANY(const uint8* src_argb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- int n = width & ~15; \
- ANYTOUV_SSE(src_argb, dst_u, dst_v, n); \
- ANYTOUV_C(src_argb + n * BPP, \
- dst_u + (n >> 1), \
- dst_v + (n >> 1), \
- width & 15); \
- }
-
-#ifdef HAS_YUY2TOUV422ROW_SSE2
-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \
- YUY2ToUV422Row_C, 2)
-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \
- UYVYToUV422Row_C, 2)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_NEON
-UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, \
- YUY2ToUV422Row_C, 2)
-UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, \
- UYVYToUV422Row_C, 2)
-#endif
-#undef UV422ANY
-
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width) {
int32 row_sum[4] = {0, 0, 0, 0};
- for (int x = 0; x < width; ++x) {
+ int x;
+ for (x = 0; x < width; ++x) {
row_sum[0] += row[x * 4 + 0];
row_sum[1] += row[x * 4 + 1];
row_sum[2] += row[x * 4 + 2];
@@ -1157,59 +2110,35 @@
}
}
-void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
- int w, int area, uint8* dst, int count) {
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+ int w, int area, uint8* dst, int count) {
float ooa = 1.0f / area;
- for (int i = 0; i < count; ++i) {
- dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
- dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
- dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
- dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ int i;
+ for (i = 0; i < count; ++i) {
+ dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+ dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+ dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+ dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
dst += 4;
tl += 4;
bl += 4;
}
}
-#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
-
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- const uint32 b_scale = REPEAT8(value & 0xff);
- const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
- const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
- const uint32 a_scale = REPEAT8(value >> 24);
-
- for (int i = 0; i < width; ++i) {
- const uint32 b = REPEAT8(src_argb[0]);
- const uint32 g = REPEAT8(src_argb[1]);
- const uint32 r = REPEAT8(src_argb[2]);
- const uint32 a = REPEAT8(src_argb[3]);
- dst_argb[0] = SHADE(b, b_scale);
- dst_argb[1] = SHADE(g, g_scale);
- dst_argb[2] = SHADE(r, r_scale);
- dst_argb[3] = SHADE(a, a_scale);
- src_argb += 4;
- dst_argb += 4;
- }
-}
-#undef REPEAT8
-#undef SHADE
-
// Copy pixels from rotated source to destination row with a slope.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width) {
+ int i;
// Render a row of pixels from source into a buffer.
float uv[2];
uv[0] = uv_dudv[0];
uv[1] = uv_dudv[1];
- for (int i = 0; i < width; ++i) {
- int x = static_cast<int>(uv[0]);
- int y = static_cast<int>(uv[1]);
- *reinterpret_cast<uint32*>(dst_argb) =
- *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
+ for (i = 0; i < width; ++i) {
+ int x = (int)(uv[0]);
+ int y = (int)(uv[1]);
+ *(uint32*)(dst_argb) =
+ *(const uint32*)(src_argb + y * src_argb_stride +
x * 4);
dst_argb += 4;
uv[0] += uv_dudv[2];
@@ -1217,29 +2146,481 @@
}
}
+// Blend 2 rows into 1.
+static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
+ uint8* dst_uv, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
+static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
+ uint16* dst_uv, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
// C version 2x2 -> 2x1.
-void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- int y1_fraction = source_y_fraction;
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction ;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
- uint8* end = dst_ptr + (dst_width << 2);
- do {
+ int x;
+ if (y1_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width);
+ return;
+ }
+ if (y1_fraction == 128) {
+ HalfRow_C(src_ptr, src_stride, dst_ptr, width);
+ return;
+ }
+ for (x = 0; x < width - 1; x += 2) {
+ dst_ptr[0] =
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ dst_ptr[1] =
+ (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
+ src_ptr += 2;
+ src_ptr1 += 2;
+ dst_ptr += 2;
+ }
+ if (width & 1) {
+ dst_ptr[0] =
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ }
+}
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ int width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16* src_ptr1 = src_ptr + src_stride;
+ int x;
+ if (source_y_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width * 2);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+ return;
+ }
+ for (x = 0; x < width - 1; x += 2) {
dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
- dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
- dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
- dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
- dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
- dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
- src_ptr += 8;
- src_ptr1 += 8;
- dst_ptr += 8;
- } while (dst_ptr < end);
+ src_ptr += 2;
+ src_ptr1 += 2;
+ dst_ptr += 2;
+ }
+ if (width & 1) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ }
}
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ int index0 = shuffler[0];
+ int index1 = shuffler[1];
+ int index2 = shuffler[2];
+ int index3 = shuffler[3];
+ // Shuffle a row of ARGB.
+ int x;
+ for (x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 b = src_argb[index0];
+ uint8 g = src_argb[index1];
+ uint8 r = src_argb[index2];
+ uint8 a = src_argb[index3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[1];
+ dst_frame[3] = src_v[0];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = 0;
+ dst_frame[3] = src_v[0];
+ }
+}
+
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[1];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = 0;
+ }
+}
+
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ float b = (float)(src_argb[0]);
+ float g = (float)(src_argb[1]);
+ float r = (float)(src_argb[2]);
+ float a = (float)(src_argb[3]);
+ float b2 = b * b;
+ float g2 = g * g;
+ float r2 = r * r;
+ float a2 = a * a;
+ float db = poly[0] + poly[4] * b;
+ float dg = poly[1] + poly[5] * g;
+ float dr = poly[2] + poly[6] * r;
+ float da = poly[3] + poly[7] * a;
+ float b3 = b2 * b;
+ float g3 = g2 * g;
+ float r3 = r2 * r;
+ float a3 = a2 * a;
+ db += poly[8] * b2;
+ dg += poly[9] * g2;
+ dr += poly[10] * r2;
+ da += poly[11] * a2;
+ db += poly[12] * b3;
+ dg += poly[13] * g3;
+ dr += poly[14] * r3;
+ da += poly[15] * a3;
+
+ dst_argb[0] = Clamp((int32)(db));
+ dst_argb[1] = Clamp((int32)(dg));
+ dst_argb[2] = Clamp((int32)(dr));
+ dst_argb[3] = Clamp((int32)(da));
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uint32 bc = lumacoeff & 0xff;
+ uint32 gc = (lumacoeff >> 8) & 0xff;
+ uint32 rc = (lumacoeff >> 16) & 0xff;
+
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ const uint8* luma1;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+ src_argb[6] * rc) & 0x7F00u) + luma;
+ dst_argb[4] = luma1[src_argb[4]];
+ dst_argb[5] = luma1[src_argb[5]];
+ dst_argb[6] = luma1[src_argb[6]];
+ dst_argb[7] = src_argb[7];
+ src_argb += 8;
+ dst_argb += 8;
+ }
+ if (width & 1) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[3];
+ dst[7] = src[7];
+ dst += 8;
+ src += 8;
+ }
+ if (width & 1) {
+ dst[3] = src[3];
+ }
+}
+
+void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst_a[0] = src_argb[3];
+ dst_a[1] = src_argb[7];
+ dst_a += 2;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ dst_a[0] = src_argb[3];
+ }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[0];
+ dst[7] = src[1];
+ dst += 8;
+ src += 2;
+ }
+ if (width & 1) {
+ dst[3] = src[0];
+ }
+}
+
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+ defined(HAS_I422TORGB565ROW_SSSE3)
+// row_win.cc has asm version, but GCC uses 2 step wrapper.
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+#else
+ ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+#else
+ ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+ // TODO(fbarchard): ARGBToRGB24Row_AVX2
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
new file mode 100644
index 0000000..1ac7ef1
--- /dev/null
+++ b/files/source/row_gcc.cc
@@ -0,0 +1,5534 @@
+// VERSION 2
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPeg full range.
+static vec8 kARGBToYJ = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static vec8 kARGBToUJ = {
+ 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static vec8 kARGBToVJ = {
+ -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// Constants for BGRA
+static vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static vec8 kBGRAToU = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static vec8 kBGRAToV = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+static vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static vec8 kABGRToU = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static vec8 kABGRToV = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static vec8 kRGBAToU = {
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static vec8 kRGBAToV = {
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 7 bit fixed point 0.5.
+static vec16 kAddYJ64 = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static uvec16 kAddUVJ128 = {
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+static uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static uvec8 kShuffleMaskRAWToARGB = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting RAW to RGB24. First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24. Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24. Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {
+ 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
+ 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {
+ 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
+ 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
+};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {
+ 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
+ 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
+};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {
+ 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
+ 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
+};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x18,0) ",%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGB24_0), // %3
+ "m"(kShuffleMaskRAWToRGB24_1), // %4
+ "m"(kShuffleMaskRAWToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
+ const uint32 dither4, int width) {
+ asm volatile (
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
+ const uint32 dither4, int width) {
+ asm volatile (
+ "vbroadcastss %3,%%xmm6 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :: "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd for vphaddw + vpackuswb vpermd.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+ VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+ VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUVJ128), // %5
+ "m"(kARGBToVJ), // %6
+ "m"(kARGBToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kAddUVJ128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
+ "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
+ "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
+
+// Read 2 UV from 411, upsample to 8 UV.
+// reading 4 bytes is an msan violation.
+// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"
+// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
+// pinsrw fails with drmemory
+// __asm pinsrw xmm0, [esi], 0 /* U */
+// __asm pinsrw xmm1, [esi + edi], 0 /* V */
+#define READYUV411_TEMP \
+ "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
+ "movd %[temp],%%xmm0 \n" \
+ MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
+ "movd %[temp],%%xmm1 \n" \
+ "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpckldq %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 VU from NV21, upsample to 8 UV
+#define READNV21 \
+ "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
+ "pshufb %[kShuffleNV21], %%xmm0 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+
+// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+#define READYUY2 \
+ "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
+ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
+ "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
+ "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
+ "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
+
+// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+#define READUYVY \
+ "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
+ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
+ "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
+ "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
+ "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP(yuvconstants) \
+ "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
+ "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
+ "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
+ "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
+ "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
+ "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
+ "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa %%xmm11,%%xmm0 \n" \
+ "pmaddubsw %%xmm8,%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa %%xmm12,%%xmm1 \n" \
+ "pmaddubsw %%xmm9,%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa %%xmm13,%%xmm2 \n" \
+ "pmaddubsw %%xmm10,%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw %%xmm14,%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+#define YUVTORGB_REGS \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
+#else
+#define YUVTORGB_SETUP(yuvconstants)
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+#define YUVTORGB_REGS
+#endif
+
+// Store 8 ARGB values.
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
+ "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
+
+// Store 8 RGBA values.
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
+ "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
+ "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ LABELALIGN
+ "1: \n"
+ READYUVA422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422ALPHATOARGBROW_SSSE3
+
+#ifdef HAS_I411TOARGBROW_SSSE3
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int temp;
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV411_TEMP
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [temp]"=&r"(temp), // %[temp]
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READNV12
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READNV21
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUY2
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READUYVY
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STORERGBA
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+// Read 16 UV from 444
+#define READYUV444_AVX2 \
+ "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
+#define READYUVA422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
+ "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
+
+// Read 4 UV from 411, upsample to 16 UV.
+#define READYUV411_AVX2 \
+ "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 \
+ "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 VU from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+ "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+ "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
+ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
+ "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
+ "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
+ "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+ "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
+ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
+ "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
+ "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
+ "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
+ "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
+ "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
+ "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
+ "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
+ "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
+ "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
+#define YUVTORGB_AVX2(yuvconstants) \
+ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
+ "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
+ "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
+ "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
+ "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+#define YUVTORGB_REGS_AVX2 \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+#else // Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_SETUP_AVX2(yuvconstants)
+#define YUVTORGB_AVX2(yuvconstants) \
+ "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
+ "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
+ "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+#define YUVTORGB_REGS_AVX2
+#endif
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
+ "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
+ "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV411_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I411TOARGBROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ LABELALIGN
+ "1: \n"
+ READYUVA422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READNV12_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READNV21_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUY2_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READUYVY_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+ "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "movdqa %4,%%xmm1 \n"
+ "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vmovdqu %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
+ "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+ "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+ "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
+ LABELALIGN
+ "2: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+ "9: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Multiple of 1.
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep movsb " MEMMOVESTRING(0,1) " \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ", %%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
+ "lea " MEMLEA(0x20, 0) ", %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8, 1) ", %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
+ "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width >> 2);
+ const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
+ asm volatile (
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosb " MEMSTORESTRING(al,0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 8 pixels at a time
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
+ "vbroadcastss %%xmm6,%%ymm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
+ "vbroadcastss %%xmm7,%%ymm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 32 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ :: "memory", "cc", "eax",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+static uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
+};
+static uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
+};
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha_AVX2) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile (
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "movzb " MEMACCESS2(0x13,0) ",%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x17,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__AVX2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
+ MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
+ MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
+ MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop \n"
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ LABELALIGN
+ "10: \n"
+ "movd " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ asm volatile (
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+ // 4 pixel small loop \n"
+ LABELALIGN
+ "4: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
+
+ // 4 pixel loop \n"
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ LABELALIGN
+ "10: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* src_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp;
+ asm volatile (
+ "movq " MEMACCESS(3) ",%%xmm2 \n"
+ "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop \n"
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1," MEMACCESS(2) " \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x04,2) ",%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "=&r"(temp) // %5
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+rm"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vbroadcastss %%xmm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
+ "vbroadcastss %%xmm4,%%ymm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "rep movsb " MEMMOVESTRING(1,0) " \n"
+ "jmp 999f \n"
+
+ "99: \n"
+ "vzeroupper \n"
+ "999: \n"
+ : "+D"(dst_ptr), // %0
+ "+S"(src_ptr), // %1
+ "+cm"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ asm volatile (
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ uintptr_t pixel_temp;
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+ "mov " MEMACCESS(4) ",%k2 \n"
+ "cmp $0x3000102,%k2 \n"
+ "je 3012f \n"
+ "cmp $0x10203,%k2 \n"
+ "je 123f \n"
+ "cmp $0x30201,%k2 \n"
+ "je 321f \n"
+ "cmp $0x2010003,%k2 \n"
+ "je 2103f \n"
+
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS(1) " \n"
+ "movzb " MEMACCESS2(0x1,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x1,1) " \n"
+ "movzb " MEMACCESS2(0x2,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x2,1) " \n"
+ "movzb " MEMACCESS2(0x3,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x3,1) " \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "123: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 123b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "321: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x39,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x39,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x39,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x39,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 321b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "2103: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x93,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x93,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x93,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x93,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 2103b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "3012: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
+ "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 3012b \n"
+
+ "99: \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "=&d"(pixel_temp), // %2
+ "+r"(width) // %3
+ : "r"(shuffler) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(3) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+#endif // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS(3) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+#endif // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "pxor %%xmm3,%%xmm3 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
+ "addps " MEMACCESS(3) ",%%xmm0 \n"
+ "addps " MEMACCESS(3) ",%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
+ "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+ "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+ "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Tranform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
+ MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x1,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Tranform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ uintptr_t pixel_temp;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Tranform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uintptr_t pixel_temp;
+ uintptr_t table_temp;
+ asm volatile (
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(2) ",%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS(2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS(3) " \n"
+ "movzb " MEMACCESS2(0x1,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x1,3) " \n"
+ "movzb " MEMACCESS2(0x2,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x2,3) " \n"
+ "movzb " MEMACCESS2(0x3,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x3,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x4,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x4,3) " \n"
+ "movzb " MEMACCESS2(0x5,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x5,3) " \n"
+ "movzb " MEMACCESS2(0x6,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x6,3) " \n"
+ "movzb " MEMACCESS2(0x7,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x7,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x8,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x8,3) " \n"
+ "movzb " MEMACCESS2(0x9,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x9,3) " \n"
+ "movzb " MEMACCESS2(0xa,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xa,3) " \n"
+ "movzb " MEMACCESS2(0xb,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xb,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb " MEMACCESS2(0xc,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xc,3) " \n"
+ "movzb " MEMACCESS2(0xd,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xd,3) " \n"
+ "movzb " MEMACCESS2(0xe,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xe,3) " \n"
+ "movzb " MEMACCESS2(0xf,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xf,3) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "lea " MEMLEA(0x10,3) ",%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "=&d"(pixel_temp), // %0
+ "=&a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_mips.cc b/files/source/row_mips.cc
new file mode 100644
index 0000000..285f0b5
--- /dev/null
+++ b/files/source/row_mips.cc
@@ -0,0 +1,782 @@
+/*
+ * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at ,$zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+ // word-aligned now count is the remining bytes count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the reminder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+ // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+ // the "t0-32" address
+ // This means: for x=128 the last "safe" a1 address is "t0-160"
+ // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+ // we will use "pref 30,128(a1)", so "t0-160" is the limit
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sgtu $v1, %[dst], $t9 \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src
+ "move %[count], $t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // the t8 is the reminder count past 32-bytes
+ "beq %[count], $t8, chk1w \n"
+ // count=t8,no 32-byte chunk
+ " nop \n"
+
+ "lw $t0, 0(%[src]) \n"
+ "lw $t1, 4(%[src]) \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the reminder past 1w chunks
+ "beq %[count], $t8, $last8 \n"
+ " subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+ // copying in words (4-byte chunks)
+ "$wordCopy_loop: \n"
+ "lw $t3, 0(%[src]) \n"
+ // the first t3 may be equal t0 ... optimize?
+ "addiu %[src], %[src],4 \n"
+ "addiu %[dst], %[dst],4 \n"
+ "bne %[dst], $a3,$wordCopy_loop \n"
+ " sw $t3, -4(%[dst]) \n"
+
+ // For the last (<8) bytes
+ "$last8: \n"
+ "blez %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 -last dst address
+ "$last8loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst], $a3, $last8loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "leave: \n"
+ " j $ra \n"
+ " nop \n"
+
+ //
+ // UNALIGNED case
+ //
+
+ "unaligned: \n"
+ // got here with a3="negu a1"
+ "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
+ "beqz $a3, $ua_chk16w \n"
+ " subu %[count], %[count], $a3 \n"
+ // bytes left after initial a3 bytes
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
+ "swr $v1, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+ // below the dst will be word aligned (NOTE1)
+ "$ua_chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the reminder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sgtu $v1,%[dst],$t9 \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // the t8 is the reminder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the reminder past 1w chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ : [count] "r" (count)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+ "t8", "t9", "a3", "v1", "at"
+ );
+}
+#endif // HAS_COPYROW_MIPS
+
+// DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "srl $t4, %[width], 4 \n" // multiplies of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw $t6, 12(%[dst_v]) \n"
+ "sw $t7, 12(%[dst_u]) \n"
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [width] "+r" (width),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "t7", "t8", "t9"
+ );
+}
+
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t4, %[width], 4 \n" // multiplies of 16
+ "andi $t5, %[width], 0xf \n"
+ "blez $t4, 2f \n"
+ " addu %[src], %[src], %[width] \n" // src += width
+
+ "1: \n"
+ "lw $t0, -16(%[src]) \n" // |3|2|1|0|
+ "lw $t1, -12(%[src]) \n" // |7|6|5|4|
+ "lw $t2, -8(%[src]) \n" // |11|10|9|8|
+ "lw $t3, -4(%[src]) \n" // |15|14|13|12|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t1, $t1 \n" // |6|7|4|5|
+ "wsbh $t2, $t2 \n" // |10|11|8|9|
+ "wsbh $t3, $t3 \n" // |14|15|12|13|
+ "rotr $t0, $t0, 16 \n" // |0|1|2|3|
+ "rotr $t1, $t1, 16 \n" // |4|5|6|7|
+ "rotr $t2, $t2, 16 \n" // |8|9|10|11|
+ "rotr $t3, $t3, 16 \n" // |12|13|14|15|
+ "addiu %[src], %[src], -16 \n"
+ "addiu $t4, $t4, -1 \n"
+ "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
+ "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
+ "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
+ "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
+ "bgtz $t4, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+ "beqz $t5, 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -1(%[src]) \n"
+ "addiu $t5, $t5, -1 \n"
+ "addiu %[src], %[src], -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgez $t5, 2b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src] "+r" (src), [dst] "+r" (dst)
+ : [width] "r" (width)
+ : "t0", "t1", "t2", "t3", "t4", "t5"
+ );
+}
+
+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int x;
+ int y;
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "addu $t4, %[width], %[width] \n"
+ "srl %[x], %[width], 4 \n"
+ "andi %[y], %[width], 0xf \n"
+ "blez %[x], 2f \n"
+ " addu %[src_uv], %[src_uv], $t4 \n"
+
+ "1: \n"
+ "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
+ "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
+ "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
+ "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
+ "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
+ "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
+ "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
+ "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
+
+ "rotr $t0, $t0, 16 \n" // |1|0|3|2|
+ "rotr $t1, $t1, 16 \n" // |5|4|7|6|
+ "rotr $t2, $t2, 16 \n" // |9|8|11|10|
+ "rotr $t3, $t3, 16 \n" // |13|12|15|14|
+ "rotr $t4, $t4, 16 \n" // |17|16|19|18|
+ "rotr $t6, $t6, 16 \n" // |21|20|23|22|
+ "rotr $t7, $t7, 16 \n" // |25|24|27|26|
+ "rotr $t8, $t8, 16 \n" // |29|28|31|30|
+ "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
+ "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
+ "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
+ "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
+ "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
+ "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
+ "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
+ "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
+ "addiu %[src_uv], %[src_uv], -32 \n"
+ "addiu %[x], %[x], -1 \n"
+ "swr $t4, 0(%[dst_u]) \n"
+ "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
+ "swr $t6, 0(%[dst_v]) \n"
+ "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
+ "swr $t3, 4(%[dst_v]) \n"
+ "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
+ "swr $t0, 8(%[dst_u]) \n"
+ "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
+ "swr $t1, 8(%[dst_v]) \n"
+ "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
+ "swr $t9, 12(%[dst_u]) \n"
+ "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
+ "swr $t5, 12(%[dst_v]) \n"
+ "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz %[x], 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+ "beqz %[y], 3f \n"
+ " nop \n"
+ "b 2f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -2(%[src_uv]) \n"
+ "lbu $t1, -1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], -2 \n"
+ "addiu %[y], %[y], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[y], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v),
+ [x] "=&r" (x),
+ [y] "=&r" (y)
+ : [width] "r" (width)
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t7", "t8", "t9"
+ );
+}
+
+// Convert (4 Y and 2 VU) I422 and arrange RGB values into
+// t5 = | 0 | B0 | 0 | b0 |
+// t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |
+// t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |
+// t1 = | 0 | R1 | 0 | r1 |
+#define YUVTORGB \
+ "lw $t0, 0(%[y_buf]) \n" \
+ "lhu $t1, 0(%[u_buf]) \n" \
+ "lhu $t2, 0(%[v_buf]) \n" \
+ "preceu.ph.qbr $t1, $t1 \n" \
+ "preceu.ph.qbr $t2, $t2 \n" \
+ "preceu.ph.qbra $t3, $t0 \n" \
+ "preceu.ph.qbla $t0, $t0 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t3, $t3, $s4 \n" \
+ "subu.ph $t0, $t0, $s4 \n" \
+ "mul.ph $t3, $t3, $s0 \n" \
+ "mul.ph $t0, $t0, $s0 \n" \
+ "shll.ph $t4, $t1, 0x7 \n" \
+ "subu.ph $t4, $t4, $t1 \n" \
+ "mul.ph $t6, $t1, $s1 \n" \
+ "mul.ph $t1, $t2, $s2 \n" \
+ "addq_s.ph $t5, $t4, $t3 \n" \
+ "addq_s.ph $t4, $t4, $t0 \n" \
+ "shra.ph $t5, $t5, 6 \n" \
+ "shra.ph $t4, $t4, 6 \n" \
+ "addiu %[u_buf], 2 \n" \
+ "addiu %[v_buf], 2 \n" \
+ "addu.ph $t6, $t6, $t1 \n" \
+ "mul.ph $t1, $t2, $s3 \n" \
+ "addu.ph $t9, $t6, $t3 \n" \
+ "addu.ph $t8, $t6, $t0 \n" \
+ "shra.ph $t9, $t9, 6 \n" \
+ "shra.ph $t8, $t8, 6 \n" \
+ "addu.ph $t2, $t1, $t3 \n" \
+ "addu.ph $t1, $t1, $t0 \n" \
+ "shra.ph $t2, $t2, 6 \n" \
+ "shra.ph $t1, $t1, 6 \n" \
+ "subu.ph $t5, $t5, $s5 \n" \
+ "subu.ph $t4, $t4, $s5 \n" \
+ "subu.ph $t9, $t9, $s5 \n" \
+ "subu.ph $t8, $t8, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "shll_s.ph $t5, $t5, 8 \n" \
+ "shll_s.ph $t4, $t4, 8 \n" \
+ "shll_s.ph $t9, $t9, 8 \n" \
+ "shll_s.ph $t8, $t8, 8 \n" \
+ "shll_s.ph $t2, $t2, 8 \n" \
+ "shll_s.ph $t1, $t1, 8 \n" \
+ "shra.ph $t5, $t5, 8 \n" \
+ "shra.ph $t4, $t4, 8 \n" \
+ "shra.ph $t9, $t9, 8 \n" \
+ "shra.ph $t8, $t8, 8 \n" \
+ "shra.ph $t2, $t2, 8 \n" \
+ "shra.ph $t1, $t1, 8 \n" \
+ "addu.ph $t5, $t5, $s5 \n" \
+ "addu.ph $t4, $t4, $s5 \n" \
+ "addu.ph $t9, $t9, $s5 \n" \
+ "addu.ph $t8, $t8, $s5 \n" \
+ "addu.ph $t2, $t2, $s5 \n" \
+ "addu.ph $t1, $t1, $s5 \n"
+
+// TODO(fbarchard): accept yuv conversion constants.
+void I422ToARGBRow_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128| // clipping
+ "lui $s6, 0xff00 \n"
+ "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
+
+ "1: \n"
+ YUVTORGB
+// Arranging into argb format
+ "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
+ "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
+ "addiu %[width], -4 \n"
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
+ "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ int y0_fraction = 256 - source_y_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "replv.ph $t0, %[y0_fraction] \n"
+ "replv.ph $t1, %[source_y_fraction] \n"
+
+ "1: \n"
+ "lw $t2, 0(%[src_ptr]) \n"
+ "lw $t3, 0(%[src_ptr1]) \n"
+ "lw $t4, 4(%[src_ptr]) \n"
+ "lw $t5, 4(%[src_ptr1]) \n"
+ "muleu_s.ph.qbl $t6, $t2, $t0 \n"
+ "muleu_s.ph.qbr $t7, $t2, $t0 \n"
+ "muleu_s.ph.qbl $t8, $t3, $t1 \n"
+ "muleu_s.ph.qbr $t9, $t3, $t1 \n"
+ "muleu_s.ph.qbl $t2, $t4, $t0 \n"
+ "muleu_s.ph.qbr $t3, $t4, $t0 \n"
+ "muleu_s.ph.qbl $t4, $t5, $t1 \n"
+ "muleu_s.ph.qbr $t5, $t5, $t1 \n"
+ "addq.ph $t6, $t6, $t8 \n"
+ "addq.ph $t7, $t7, $t9 \n"
+ "addq.ph $t2, $t2, $t4 \n"
+ "addq.ph $t3, $t3, $t5 \n"
+ "shra.ph $t6, $t6, 8 \n"
+ "shra.ph $t7, $t7, 8 \n"
+ "shra.ph $t2, $t2, 8 \n"
+ "shra.ph $t3, $t3, 8 \n"
+ "precr.qb.ph $t6, $t6, $t7 \n"
+ "precr.qb.ph $t2, $t2, $t3 \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[src_ptr1], %[src_ptr1], 8 \n"
+ "addiu %[dst_width], %[dst_width], -8 \n"
+ "sw $t6, 0(%[dst_ptr]) \n"
+ "sw $t2, 4(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[dst_ptr], %[dst_ptr], 8 \n"
+
+ ".set pop \n"
+ : [dst_ptr] "+r" (dst_ptr),
+ [src_ptr1] "+r" (src_ptr1),
+ [src_ptr] "+r" (src_ptr),
+ [dst_width] "+r" (dst_width)
+ : [source_y_fraction] "r" (source_y_fraction),
+ [y0_fraction] "r" (y0_fraction),
+ [src_stride] "r" (src_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+#endif // __mips_dsp_rev >= 2
+
+#endif // defined(__mips__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
index 19a7833..909df06 100644
--- a/files/source/row_neon.cc
+++ b/files/source/row_neon.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -16,812 +16,2826 @@
#endif
// This module is for GCC Neon
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
- "vld1.u8 {d0}, [%0]! \n" \
- "vld1.u32 {d2[0]}, [%1]! \n" \
- "vld1.u32 {d2[1]}, [%2]! \n"
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y, 2 U and 2 V from 422
+#define READYUV411 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.16 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.16 {d2[1]}, [%2]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d2, d3 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
- "vld1.u8 {d0}, [%0]! \n" \
- "vld1.u8 {d2}, [%1]! \n" \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
// Read 8 Y and 4 VU from NV21
#define READNV21 \
- "vld1.u8 {d0}, [%0]! \n" \
- "vld1.u8 {d2}, [%1]! \n" \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
-#define YUV422TORGB \
- "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
- "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
- "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
- "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
- "vtrn.u8 d0, d1 \n" \
- "vsub.s16 q0, q0, q15 \n"/* offset y */\
- "vmul.s16 q0, q0, q14 \n" \
+// Read 8 YUY2
+#define READYUY2 \
+ MEMACCESS(0) \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ MEMACCESS(0) \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+#define YUVTORGB_SETUP \
+ MEMACCESS([kUVToRB]) \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
+ MEMACCESS([kUVToG]) \
+ "vld1.8 {d25}, [%[kUVToG]] \n" \
+ MEMACCESS([kUVBiasBGR]) \
+ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+ MEMACCESS([kUVBiasBGR]) \
+ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
+ MEMACCESS([kUVBiasBGR]) \
+ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
+ MEMACCESS([kYToRgb]) \
+ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
+
+#define YUVTORGB \
+ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
+ "vmull.u8 q9, d2, d25 \n" /* u/v G component */\
+ "vmovl.u8 q0, d0 \n" /* Y */\
+ "vmovl.s16 q10, d1 \n" \
+ "vmovl.s16 q0, d0 \n" \
+ "vmul.s32 q10, q10, q15 \n" \
+ "vmul.s32 q0, q0, q15 \n" \
+ "vqshrun.s32 d0, q0, #16 \n" \
+ "vqshrun.s32 d1, q10, #16 \n" /* Y */\
"vadd.s16 d18, d19 \n" \
- "vqadd.s16 d20, d0, d16 \n" \
- "vqadd.s16 d21, d1, d16 \n" \
- "vqadd.s16 d22, d0, d17 \n" \
- "vqadd.s16 d23, d1, d17 \n" \
- "vqadd.s16 d16, d0, d18 \n" \
- "vqadd.s16 d17, d1, d18 \n" \
- "vqrshrun.s16 d0, q10, #6 \n" \
- "vqrshrun.s16 d1, q11, #6 \n" \
- "vqrshrun.s16 d2, q8, #6 \n" \
- "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
- "vmovl.u8 q11, d1 \n" \
- "vmovl.u8 q8, d2 \n" \
- "vtrn.u8 d20, d21 \n" \
- "vtrn.u8 d22, d23 \n" \
- "vtrn.u8 d16, d17 \n" \
- "vmov.u8 d21, d16 \n"
+ "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\
+ "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\
+ "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\
+ "vaddw.u16 q1, q1, d16 \n" \
+ "vaddw.u16 q10, q10, d17 \n" \
+ "vaddw.u16 q3, q3, d18 \n" \
+ "vqadd.s16 q8, q0, q13 \n" /* B */ \
+ "vqadd.s16 q9, q0, q14 \n" /* R */ \
+ "vqadd.s16 q0, q0, q4 \n" /* G */ \
+ "vqadd.s16 q8, q8, q1 \n" /* B */ \
+ "vqadd.s16 q9, q9, q10 \n" /* R */ \
+ "vqsub.s16 q0, q0, q3 \n" /* G */ \
+ "vqshrun.s16 d20, q8, #6 \n" /* B */ \
+ "vqshrun.s16 d22, q9, #6 \n" /* R */ \
+ "vqshrun.s16 d21, q0, #6 \n" /* G */
-#if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) || \
- defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON)
-static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-#endif
-
-#ifdef HAS_I422TOARGBROW_NEON
-void I422ToARGBRow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- "vld1.u8 {d24}, [%5] \n"
- "vld1.u8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
+ YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
+ "1: \n"
+ READYUV444
+ YUVTORGB
+ "subs %4, %4, #8 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif // HAS_I422TOARGBROW_NEON
-#ifdef HAS_I422TOBGRAROW_NEON
-void I422ToBGRARow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- "vld1.u8 {d24}, [%5] \n"
- "vld1.u8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d19, #255 \n"
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-#endif // HAS_I422TOBGRAROW_NEON
-
-#ifdef HAS_I422TOABGRROW_NEON
-void I422ToABGRRow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- asm volatile (
- "vld1.u8 {d24}, [%5] \n"
- "vld1.u8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
- "1: \n"
- READYUV422
- YUV422TORGB
- "subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
+ YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "subs %4, %4, #8 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif // HAS_I422TOABGRROW_NEON
-#ifdef HAS_I422TORGBAROW_NEON
-void I422ToRGBARow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+void I422AlphaToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
- "vld1.u8 {d24}, [%5] \n"
- "vld1.u8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUVTORGB_SETUP
"1: \n"
READYUV422
- YUV422TORGB
+ YUVTORGB
+ "subs %5, %5, #8 \n"
+ MEMACCESS(3)
+ "vld1.8 {d23}, [%3]! \n"
+ MEMACCESS(4)
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ READYUV411
+ YUVTORGB
"subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB
+ MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif // HAS_I422TORGBAROW_NEON
-#ifdef HAS_I422TORGB24ROW_NEON
-void I422ToRGB24Row_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
- "vld1.u8 {d24}, [%5] \n"
- "vld1.u8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUVTORGB_SETUP
"1: \n"
READYUV422
- YUV422TORGB
+ YUVTORGB
"subs %4, %4, #8 \n"
+ MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif // HAS_I422TORGB24ROW_NEON
-#ifdef HAS_I422TORAWROW_NEON
-void I422ToRAWRow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+#define ARGBTORGB565 \
+ "vshll.u8 q0, d22, #8 \n" /* R */ \
+ "vshll.u8 q8, d21, #8 \n" /* G */ \
+ "vshll.u8 q9, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #5 \n" /* RG */ \
+ "vsri.16 q0, q9, #11 \n" /* RGB */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
- "vld1.u8 {d24}, [%5] \n"
- "vld1.u8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUVTORGB_SETUP
"1: \n"
READYUV422
- YUV422TORGB
+ YUVTORGB
"subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
+ ARGBTORGB565
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif // HAS_I422TORAWROW_NEON
-#ifdef HAS_NV12TOARGBROW_NEON
-void NV12ToARGBRow_NEON(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
+#define ARGBTOARGB1555 \
+ "vshll.u8 q0, d23, #8 \n" /* A */ \
+ "vshll.u8 q8, d22, #8 \n" /* R */ \
+ "vshll.u8 q9, d21, #8 \n" /* G */ \
+ "vshll.u8 q10, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #1 \n" /* AR */ \
+ "vsri.16 q0, q9, #6 \n" /* ARG */ \
+ "vsri.16 q0, q10, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB1555
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB4444
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
int width) {
asm volatile (
- "vld1.u8 {d24}, [%4] \n"
- "vld1.u8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ READYUV400
+ YUVTORGB
+ "subs %2, %2, #8 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
+ [kUVToG]"r"(&kYuvI601Constants.kUVToG),
+ [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
+ [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23"
+ );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
"1: \n"
READNV12
- YUV422TORGB
+ YUVTORGB
"subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(uv_buf), // %1
- "+r"(rgb_buf), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif // HAS_NV12TOARGBROW_NEON
-#ifdef HAS_NV21TOARGBROW_NEON
-void NV21ToARGBRow_NEON(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- "vld1.u8 {d24}, [%4] \n"
- "vld1.u8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
"1: \n"
READNV21
- YUV422TORGB
+ YUVTORGB
"subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
- : "+r"(y_buf), // %0
- "+r"(uv_buf), // %1
- "+r"(rgb_buf), // %2
- "+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif // HAS_NV21TOARGBROW_NEON
-#ifdef HAS_SPLITUV_NEON
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
-// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
-void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
- ".p2align 2 \n"
+ YUVTORGB_SETUP
"1: \n"
- "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ READNV12
+ YUVTORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ READYUY2
+ YUVTORGB
+ "subs %2, %2, #8 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ READUYVY
+ YUVTORGB
+ "subs %2, %2, #8 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.u8 {q0}, [%1]! \n" // store U
- "vst1.u8 {q1}, [%2]! \n" // Store V
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store U
+ MEMACCESS(2)
+ "vst1.8 {q1}, [%2]! \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
- : "memory", "cc", "q0", "q1" // Clobber List
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
-#endif // HAS_SPLITUV_NEON
-#ifdef HAS_COPYROW_NEON
-// Copy multiple of 64
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load U
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
- "vldm %0!, {q0, q1, q2, q3} \n" // load 64
- "subs %2, %2, #64 \n" // 64 processed per loop
- "vstm %1!, {q0, q1, q2, q3} \n" // store 64
+ MEMACCESS(0)
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2 // Output registers
- : // Input registers
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
-#endif // HAS_COPYROW_NEON
-#ifdef HAS_SETROW_NEON
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
- asm volatile ( // NOLINT
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1: \n"
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile (
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.u32 {q0}, [%0]! \n" // store
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "q0", "memory", "cc");
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "q0"
+ );
}
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-void SetRows32_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- SetRow8_NEON(dst, v32, width << 2);
- dst += dst_stride;
- }
+// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0"
+ );
}
-#endif // HAS_SETROW_NEON
-#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
- // compute where to start writing destination
- "add %1, %2 \n"
- // work on segments that are multiples of 16
- "lsrs r3, %2, #4 \n"
- // the output is written in two block. 8 bytes followed
- // by another 8. reading is done sequentially, from left to
- // right. writing is done from right to left in block sizes
- // %1, the destination pointer is incremented after writing
- // the first of the two blocks. need to subtract that 8 off
- // along with 16 to get the next location.
- "mov r3, #-24 \n"
- "beq 2f \n"
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, #16 \n"
- // back of destination by the size of the register that is
- // going to be mirrored
- "sub %1, #16 \n"
- // the loop needs to run on blocks of 16. what will be left
- // over is either a negative number, the residuals that need
- // to be done, or 0. If this isn't subtracted off here the
- // loop will run one extra time.
- "sub %2, #16 \n"
-
- // mirror the bytes in the 64 bit segments. unable to mirror
- // the bytes in the entire 128 bits in one go.
- // because of the inability to mirror the entire 128 bits
- // mirror the writing out of the two 64 bit segments.
- ".p2align 2 \n"
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // src += 16
- "subs %2, #16 \n"
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n"
- "vst1.8 {d0}, [%1], r3 \n" // dst -= 16
- "bge 1b \n"
-
- // add 16 back to the counter. if the result is 0 there is no
- // residuals so jump past
- "adds %2, #16 \n"
- "beq 5f \n"
- "add %1, #16 \n"
- "2: \n"
- "mov r3, #-3 \n"
- "sub %1, #2 \n"
- "subs %2, #2 \n"
- // check for 16*n+1 scenarios where segments_of_2 should not
- // be run, but there is something left over.
- "blt 4f \n"
-
-// do this in neon registers as per
-// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
- "3: \n"
- "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
- "subs %2, #2 \n"
- "vst1.8 {d1[0]}, [%1]! \n"
- "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
- "bge 3b \n"
-
- "adds %2, #2 \n"
- "beq 5f \n"
- "4: \n"
- "add %1, #1 \n"
- "vld1.8 {d0[0]}, [%0] \n"
- "vst1.8 {d0[0]}, [%1] \n"
- "5: \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "r3", "q0"
- );
-}
-#endif // HAS_MIRRORROW_NEON
-
-#ifdef HAS_MIRRORROWUV_NEON
-void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
- asm volatile (
- // compute where to start writing destination
- "add %1, %3 \n" // dst_a + width
- "add %2, %3 \n" // dst_b + width
- // work on input segments that are multiples of 16, but
- // width that has been passed is output segments, half
- // the size of input.
- "lsrs r12, %3, #3 \n"
- "beq 2f \n"
- // the output is written in to two blocks.
- "mov r12, #-8 \n"
- // back of destination by the size of the register that is
- // going to be mirrord
- "sub %1, #8 \n"
- "sub %2, #8 \n"
- // the loop needs to run on blocks of 8. what will be left
- // over is either a negative number, the residuals that need
- // to be done, or 0. if this isn't subtracted off here the
- // loop will run one extra time.
- "sub %3, #8 \n"
-
- // mirror the bytes in the 64 bit segments
- ".p2align 2 \n"
- "1: \n"
- "vld2.8 {d0, d1}, [%0]! \n" // src += 16
- "subs %3, #8 \n"
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
- "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
- "bge 1b \n"
-
- // add 8 back to the counter. if the result is 0 there is no
- // residuals so return
- "adds %3, #8 \n"
- "beq 4f \n"
- "add %1, #8 \n"
- "add %2, #8 \n"
- "2: \n"
- "mov r12, #-1 \n"
- "sub %1, #1 \n"
- "sub %2, #1 \n"
- "3: \n"
- "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
- "subs %3, %3, #1 \n"
- "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
- "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
- "bgt 3b \n"
- "4: \n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "r12", "q0"
- );
-}
-#endif // HAS_MIRRORROWUV_NEON
-
-#ifdef HAS_BGRATOARGBROW_NEON
-void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
- asm volatile (
- ".p2align 2 \n"
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d2 \n" // swap G, R
- "vswp.u8 d0, d3 \n" // swap B, A
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #16 \n" // 16 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n"
"bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
:
- : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "r3", "q0"
);
}
-#endif // HAS_BGRATOARGBROW_NEON
-#ifdef HAS_ABGRTOARGBROW_NEON
-void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
asm volatile (
- ".p2align 2 \n"
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d0, d2 \n" // swap R, B
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ MEMACCESS(0)
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n"
"bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
:
- : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "r12", "q0"
);
}
-#endif // HAS_ABGRTOARGBROW_NEON
-#ifdef HAS_RGBATOARGBROW_NEON
-void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
- ".p2align 2 \n"
- "1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmov.u8 d4, d0 \n" // move A after RGB
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #4 \n" // 4 pixels per loop.
+ "vrev64.32 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
:
- : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "r3", "q0"
);
}
-#endif // HAS_RGBATOARGBROW_NEON
-#ifdef HAS_RGB24TOARGBROW_NEON
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d4, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
- "+r"(pix) // %2
+ "+r"(width) // %2
:
- : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-#endif // HAS_RGB24TOARGBROW_NEON
-#ifdef HAS_RAWTOARGBROW_NEON
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d4, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
- "+r"(pix) // %2
+ "+r"(width) // %2
:
- : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-#endif // HAS_RAWTOARGBROW_NEON
-#ifdef HAS_ARGBTORGBAROW_NEON
-void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ MEMACCESS(0)
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmov.u8 d0, d4 \n" // move A before RGB.
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
"bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgba), // %1
- "+r"(pix) // %2
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
:
- : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "d1", "d2", "d3" // Clobber List
);
}
-#endif // HAS_ARGBTORGBAROW_NEON
-#ifdef HAS_ARGBTORGB24ROW_NEON
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile (
- ".p2align 2 \n"
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */ \
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
- "+r"(pix) // %2
+ "+r"(width) // %2
:
- : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-#endif // HAS_ARGBTORGB24ROW_NEON
-#ifdef HAS_ARGBTORAWROW_NEON
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
- "+r"(pix) // %2
+ "+r"(width) // %2
:
- : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-#endif // HAS_ARGBTORAWROW_NEON
-#ifdef HAS_YUY2TOYROW_NEON
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
- "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
- "+r"(pix) // %2
+ "+r"(width) // %2
:
- : "memory", "cc", "q0", "q1" // Clobber List
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
-#endif // HAS_YUY2TOYROW_NEON
-#ifdef HAS_UYVYTOYROW_NEON
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
- "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
- "+r"(pix) // %2
+ "+r"(width) // %2
:
- : "memory", "cc", "q0", "q1" // Clobber List
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
-#endif // HAS_UYVYTOYROW_NEON
-#ifdef HAS_YUY2TOYROW_NEON
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) {
+ int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.u8 {d1}, [%1]! \n" // store 8 U.
- "vst1.u8 {d3}, [%2]! \n" // store 8 V.
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(pix) // %3
+ "+r"(width) // %3
:
- : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
}
-#endif // HAS_YUY2TOYROW_NEON
-#ifdef HAS_UYVYTOYROW_NEON
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int pix) {
+ int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.u8 {d0}, [%1]! \n" // store 8 U.
- "vst1.u8 {d2}, [%2]! \n" // store 8 V.
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(pix) // %3
+ "+r"(width) // %3
:
- : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
}
-#endif // HAS_UYVYTOYROW_NEON
-#ifdef HAS_YUY2TOYROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "adds %1, %0, %1 \n" // stride + src_yuy2
- ".p2align 2 \n"
+ "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.u8 {d1}, [%2]! \n" // store 8 U.
- "vst1.u8 {d3}, [%3]! \n" // store 8 V.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(pix) // %4
+ "+r"(width) // %4
:
- : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
);
}
-#endif // HAS_YUY2TOYROW_NEON
-#ifdef HAS_UYVYTOYROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "adds %1, %0, %1 \n" // stride + src_uyvy
- ".p2align 2 \n"
+ "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.u8 {d0}, [%2]! \n" // store 8 U.
- "vst1.u8 {d2}, [%3]! \n" // store 8 V.
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(pix) // %4
+ "+r"(width) // %4
:
- : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
);
}
-#endif // HAS_UYVYTOYROW_NEON
-#endif // __ARM_NEON__
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ asm volatile (
+ "vdup.32 d2, %2 \n" // dither4
+ "1: \n"
+ MEMACCESS(1)
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n"
+ ARGBTORGB565
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
+ "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
+ "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
+ "vpadd.u16 d1, d8, d9 \n" // B
+ "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
+ "vpadd.u16 d3, d10, d11 \n" // G
+ "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
+ "vpadd.u16 d5, d12, d13 \n" // R
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %3, %3, #32 \n" // 32 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ MEMACCESS(0)
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ MEMACCESS(1)
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ MEMACCESS(0)
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ MEMACCESS(1)
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+ );
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+ );
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ // Attenuate 8 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+ );
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ MEMACCESS(0)
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+ );
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ );
+}
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ // 16 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ MEMACCESS(0)
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(1)
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(2)
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ MEMACCESS(3)
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
new file mode 100644
index 0000000..6375d4f
--- /dev/null
+++ b/files/source/row_neon64.cc
@@ -0,0 +1,2809 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
+ MEMACCESS(2) \
+ "ld1 {v1.s}[1], [%2], #4 \n"
+
+// Read 8 Y, 2 U and 2 V from 422
+#define READYUV411 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.h}[0], [%1], #2 \n" \
+ MEMACCESS(2) \
+ "ld1 {v2.h}[1], [%2], #2 \n" \
+ "zip1 v1.8b, v2.8b, v2.8b \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
+ MEMACCESS(2) \
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ MEMACCESS(0) \
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ MEMACCESS(0) \
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+#define YUVTORGB_SETUP \
+ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+ "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+
+#define YUVTORGB(vR, vG, vB) \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
+ "ushll v0.4s, v0.4h, #0 \n" \
+ "mul v3.4s, v3.4s, v31.4s \n" \
+ "mul v0.4s, v0.4s, v31.4s \n" \
+ "sqshrun v0.4h, v0.4s, #16 \n" \
+ "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "uxtl v2.8h, v2.8b \n" \
+ "uxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul v3.8h, v1.8h, v27.8h \n" \
+ "mul v5.8h, v1.8h, v29.8h \n" \
+ "mul v6.8h, v2.8h, v30.8h \n" \
+ "mul v7.8h, v2.8h, v28.8h \n" \
+ "sqadd v6.8h, v6.8h, v5.8h \n" \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV444
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422AlphaToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ MEMACCESS(3)
+ "ld1 {v23.8b}, [%3], #8 \n"
+ "subs %w5, %w5, #8 \n"
+ MEMACCESS(4)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV411
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v20.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV422
+ YUVTORGB(v23, v22, v21)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+#define ARGBTORGB565 \
+ "shll v0.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v21.8h, #5 \n" /* RG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* RGB */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ ARGBTOARGB1555
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+#define ARGBTOARGB4444 \
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
+ ARGBTOARGB4444
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUV400
+ YUVTORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
+ [kUVToG]"r"(&kYuvI601Constants.kUVToG),
+ [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
+ [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v20", "v21", "v22", "v23"
+ );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ MEMACCESS(2)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READNV21
+ YUVTORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ MEMACCESS(2)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READNV12
+ YUVTORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READYUY2
+ YUVTORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ READUYVY
+ YUVTORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ MEMACCESS(2)
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
+ "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile (
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
+ MEMACCESS(1)
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ MEMACCESS(1)
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
+ );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %w3, %w3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1"
+ );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "rev64 v0.4s, v0.4s \n"
+ MEMACCESS(1)
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ MEMACCESS(1)
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
+ );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v4.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v5.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ MEMACCESS(1)
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ MEMACCESS(1)
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */ \
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ MEMACCESS(1)
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ MEMACCESS(2)
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ MEMACCESS(2)
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ MEMACCESS(3)
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
+ );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_uyvyb = src_uyvy + stride_uyvy;
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ MEMACCESS(3)
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(src_uyvyb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v2.8b, v1.8b, v1.8b \n"
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ MEMACCESS(2)
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
+ MEMACCESS(1)
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ MEMACCESS(2)
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+ );
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ asm volatile (
+ "dup v1.4s, %w2 \n" // dither4
+ "1: \n"
+ MEMACCESS(1)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n"
+ ARGBTORGB565
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
+ );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+ );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
+ );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v5.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v6.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+ );
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v24", "v25", "v26", "v27", "v28", "v29"
+ );
+}
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
+ "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
+ "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
+ "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w3, %w3, #32 \n" // 32 processed per loop.
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
+ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
+ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_bgra_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v2.8h, v1.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_rgba_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+ asm volatile (
+ "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
+ "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
+ "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
+ "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
+ "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
+ "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v17.D[0] \n"
+ "ins v18.D[1], v19.D[0] \n"
+ "ins v20.D[1], v21.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v18.8h, #1 \n"
+ "urshr v6.8h, v20.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v16.8h, v4.8h, v22.8h \n" // B
+ "mls v16.8h, v5.8h, v23.8h \n" // G
+ "mls v16.8h, v6.8h, v24.8h \n" // R
+ "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
+ "mul v17.8h, v6.8h, v22.8h \n" // R
+ "mls v17.8h, v5.8h, v26.8h \n" // G
+ "mls v17.8h, v4.8h, v25.8h \n" // B
+ "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_rgb565_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+ "v25", "v26", "v27"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_argb1555_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_argb4444_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
+
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
+ "v24", "v25", "v26", "v27"
+ );
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
+ );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+ asm volatile (
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ asm volatile (
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
+ );
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.ge 8b \n"
+
+ "89: \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ MEMACCESS(2)
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18"
+ );
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ // Attenuate 8 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+ );
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+ );
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ MEMACCESS(1)
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
+ );
+}
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v26.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+ );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+ );
+}
+
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v22", "v23", "v24", "v25"
+ );
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ // 16 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ MEMACCESS(2)
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1"
+ );
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ MEMACCESS(1)
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ MEMACCESS(2)
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ MEMACCESS(3)
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2LL), // %5
+ "r"(6LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1LL), // %4
+ "r"(6LL) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_posix.cc b/files/source/row_posix.cc
deleted file mode 100644
index 33149da..0000000
--- a/files/source/row_posix.cc
+++ /dev/null
@@ -1,3662 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64
-#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-
-// Constants for ARGB
-CONST vec8 kARGBToY = {
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-CONST vec8 kARGBToU = {
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
-CONST vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-// Constants for BGRA
-CONST vec8 kBGRAToY = {
- 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
-
-CONST vec8 kBGRAToU = {
- 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
-
-CONST vec8 kBGRAToV = {
- 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
-
-// Constants for ABGR
-CONST vec8 kABGRToY = {
- 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
-
-CONST vec8 kABGRToU = {
- -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
-
-CONST vec8 kABGRToV = {
- 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
-
-CONST uvec8 kAddY16 = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
-CONST uvec8 kAddUV128 = {
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RGB24 to ARGB.
-CONST uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
-
-// Shuffle table for converting RAW to ARGB.
-CONST uvec8 kShuffleMaskRAWToARGB = {
- 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
-
-// Shuffle table for converting ABGR to ARGB.
-CONST uvec8 kShuffleMaskABGRToARGB = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-
-// Shuffle table for converting BGRA to ARGB.
-CONST uvec8 kShuffleMaskBGRAToARGB = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-
-// Shuffle table for converting RGBA to ARGB.
-CONST uvec8 kShuffleMaskRGBAToARGB = {
- 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
-
-// Shuffle table for converting ARGB to RGBA.
-CONST uvec8 kShuffleMaskARGBToRGBA = {
- 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
-
-// Shuffle table for converting ARGB to RGB24.
-CONST uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting ARGB to RAW.
-CONST uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
-
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
- asm volatile (
- "movdqa %3,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
-
- : "+r"(src_abgr), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskABGRToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
- asm volatile (
- "movdqa %3,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskBGRAToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
- asm volatile (
- "movdqa %3,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
-
- : "+r"(src_rgba), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskRGBAToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
- asm volatile (
- "movdqa %3,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
-
- : "+r"(src_argb), // %0
- "+r"(dst_rgba), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskARGBToRGBA) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskRGB24ToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskRAWToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,(%1,%0,2) \n"
- "movdqa %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,(%1,%0,2) \n"
- "movdqa %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%1,%0,2) \n"
- "movdqa %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "movdqa %3,%%xmm6 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqa %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskARGBToRGB24) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "movdqa %3,%%xmm6 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqa %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- : "m"(kShuffleMaskARGBToRAW) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_argb))
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_argb))
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_bgra))
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_bgra))
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_abgr))
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_abgr))
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBTOYROW_SSSE3
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
-#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
-#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
-
-struct {
- vec8 kUVToB; // 0
- vec8 kUVToG; // 16
- vec8 kUVToR; // 32
- vec16 kUVBiasB; // 48
- vec16 kUVBiasG; // 64
- vec16 kUVBiasR; // 80
- vec16 kYSub16; // 96
- vec16 kYToRgb; // 112
- vec8 kVUToB; // 128
- vec8 kVUToG; // 144
- vec8 kVUToR; // 160
-} CONST SIMD_ALIGNED(kYuvConstants) = {
- { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR },
- { 16, 16, 16, 16, 16, 16, 16, 16 },
- { YG, YG, YG, YG, YG, YG, YG, YG },
- { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
-};
-
-
-// Read 8 UV from 411
-#define READYUV444 \
- "movq (%[u_buf]),%%xmm0 \n" \
- "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
-
-// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
-
-// Read 2 UV from 411, upsample to 8 UV
-#define READYUV411 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x2(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "punpckldq %%xmm0,%%xmm0 \n" \
-
-// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12 \
- "movq (%[uv_buf]),%%xmm0 \n" \
- "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
-
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
- "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
- "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
- "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
- "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
- "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
- "movq (%[y_buf]),%%xmm3 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
- "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n" \
-
-// Convert 8 pixels: 8 VU and 8 Y
-#define YVUTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
- "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
- "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
- "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
- "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
- "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
- "movq (%[y_buf]),%%xmm3 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
- "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n" \
-
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[argb_buf]) \n"
- "movdqa %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[argb_buf]) \n"
- "movdqa %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[argb_buf]) \n"
- "movdqa %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[argb_buf]) \n"
- "movdqa %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[argb_buf]) \n"
- "movdqa %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(vu_buf), // %[uv_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[argb_buf]) \n"
- "movdqu %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[argb_buf]) \n"
- "movdqu %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[argb_buf]) \n"
- "movdqu %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[argb_buf]) \n"
- "movdqu %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* argb_buf,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[argb_buf]) \n"
- "movdqu %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(vu_buf), // %[uv_buf]
- [argb_buf]"+r"(argb_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* bgra_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5,(%[argb_buf]) \n"
- "movdqa %%xmm0,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(bgra_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* abgr_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,(%[argb_buf]) \n"
- "movdqa %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(abgr_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* bgra_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5,(%[argb_buf]) \n"
- "movdqu %%xmm0,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(bgra_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* abgr_buf,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,(%[argb_buf]) \n"
- "movdqu %%xmm1,0x10(%[argb_buf]) \n"
- "lea 0x20(%[argb_buf]),%[argb_buf] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [argb_buf]"+r"(abgr_buf), // %[argb_buf]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_I422TOARGBROW_SSSE3
-
-#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "mov $0x10001000,%%eax \n"
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "mov $0x012a012a,%%eax \n"
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- ".p2align 4 \n"
- "1: \n"
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
-
- // Step 2: Weave into ARGB
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,16(%1) \n"
- "lea 32(%1),%1 \n"
-
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(rgb_buf), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-#endif // HAS_YTOARGBROW_SSE2
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-CONST uvec8 kShuffleMirror = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile (
- "movdqa %3,%%xmm5 \n"
- "lea -0x10(%0),%0 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0,%2),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-#endif // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_SSE2
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile (
- "lea -0x10(%0),%0 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0,%2),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "psllw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufd $0x4e,%%xmm0,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_MIRRORROW_SSE2
-
-#ifdef HAS_MIRRORROW_UV_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-CONST uvec8 kShuffleMirrorUV = {
- 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
- int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile (
- "movdqa %4,%%xmm1 \n"
- "lea -16(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "lea -16(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "sub $8,%3 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,(%1,%2) \n"
- "lea 8(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_MIRRORROW_UV_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-CONST uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile (
- "movdqa %3,%%xmm5 \n"
- "lea -0x10(%0),%0 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0,%2,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kARGBShuffleMirror) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBMIRRORROW_SSSE3
-
-#ifdef HAS_SPLITUV_SSE2
-void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm2,(%1,%2) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_SPLITUV_SSE2
-
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
- asm volatile (
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa %%xmm0,(%0,%1) \n"
- "movdqa %%xmm1,0x10(%0,%1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-#endif // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_X86
-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
- size_t width_tmp = static_cast<size_t>(width);
- asm volatile (
- "shr $0x2,%2 \n"
- "rep movsl \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
- :
- : "memory", "cc"
- );
-}
-#endif // HAS_COPYROW_X86
-
-#ifdef HAS_SETROW_X86
-void SetRow8_X86(uint8* dst, uint32 v32, int width) {
- size_t width_tmp = static_cast<size_t>(width);
- asm volatile (
- "shr $0x2,%1 \n"
- "rep stosl \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
-}
-
-void SetRows32_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- size_t width_tmp = static_cast<size_t>(width);
- uint32* d = reinterpret_cast<uint32*>(dst);
- asm volatile (
- "rep stosl \n"
- : "+D"(d), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
- dst += dst_stride;
- }
-}
-#endif // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%4,1),%%xmm2 \n"
- "movdqa 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu (%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%4,1),%%xmm2 \n"
- "movdqa 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu (%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_YUY2TOYROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time.
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x1,%3 \n"
- "je 91f \n"
- "jl 99f \n"
-
- // 1 pixel loop until destination pointer is aligned.
- "10: \n"
- "test $0xf,%2 \n"
- "je 19f \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "jge 10b \n"
-
- "19: \n"
- "add $1-4,%3 \n"
- "jl 49f \n"
-
- // 4 pixel loop.
- ".p2align 2 \n"
- "41: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jge 41b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
-
- // 1 pixel loop.
- "91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "jge 91b \n"
- "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBBLENDROW_SSE2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-CONST uvec8 kShuffleAlpha = {
- 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
-
-// Blend 8 pixels at a time
-// Shuffle table for reversing the bytes.
-
-// Same as SSE2, but replaces
-// psrlw xmm3, 8 // alpha
-// pshufhw xmm3, xmm3,0F5h // 8 alpha words
-// pshuflw xmm3, xmm3,0F5h
-// with..
-// pshufb xmm3, kShuffleAlpha // alpha
-
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x1,%3 \n"
- "je 91f \n"
- "jl 99f \n"
-
- // 1 pixel loop until destination pointer is aligned.
- "10: \n"
- "test $0xf,%2 \n"
- "je 19f \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "jge 10b \n"
-
- "19: \n"
- "add $1-4,%3 \n"
- "jl 49f \n"
- "test $0xf,%0 \n"
- "jne 41f \n"
- "test $0xf,%1 \n"
- "jne 41f \n"
-
- // 4 pixel loop.
- ".p2align 2 \n"
- "40: \n"
- "movdqa (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqa (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqa (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jge 40b \n"
- "jmp 49f \n"
-
- // 4 pixel unaligned loop.
- ".p2align 2 \n"
- "41: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jge 41b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
-
- // 1 pixel loop.
- "91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "jge 91b \n"
- "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "m"(kShuffleAlpha) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBBLENDROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATE_SSE2
-// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x8,%%xmm5 \n"
-
- // 4 pixel loop.
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pshufhw $0xff,%%xmm0,%%xmm2 \n"
- "pshuflw $0xff,%%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pshufhw $0xff,%%xmm1,%%xmm2 \n"
- "pshuflw $0xff,%%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "pand %%xmm4,%%xmm2 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBATTENUATE_SSE2
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
-CONST uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
-};
-CONST uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
-};
-// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- // 4 pixel loop.
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha0), // %3
- "m"(kShuffleAlpha1) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-// aligned to 16 bytes
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- int width) {
- uintptr_t alpha = 0;
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
-
- // 4 pixel loop.
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movzb 0x3(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x0(%4,%3,4),%%xmm2 \n"
- "movzb 0x7(%0),%3 \n"
- "movd 0x0(%4,%3,4),%%xmm3 \n"
- "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
- "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
- "movzb 0xb(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x0(%4,%3,4),%%xmm2 \n"
- "movzb 0xf(%0),%3 \n"
- "movd 0x0(%4,%3,4),%%xmm3 \n"
- "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
- "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "pand %%xmm4,%%xmm2 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "+r"(alpha) // %3
- : "r"(fixed_invtbl8) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
-CONST vec8 kARGBToGray = {
- 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
-};
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "sub %0,%1 \n"
-
- // 8 pixel loop.
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa 0x10(%0),%%xmm3 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "movdqa %%xmm1,0x10(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kARGBToGray) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-#endif // HAS_ARGBGRAYROW_SSSE3
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone
-CONST vec8 kARGBToSepiaB = {
- 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
-
-CONST vec8 kARGBToSepiaG = {
- 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
-
-CONST vec8 kARGBToSepiaR = {
- 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
- asm volatile (
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
-
- // 8 pixel loop.
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "sub $0x8,%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "movdqa %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "m"(kARGBToSepiaB), // %2
- "m"(kARGBToSepiaG), // %3
- "m"(kARGBToSepiaR) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-#endif // HAS_ARGBSEPIAROW_SSSE3
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
- int width) {
- asm volatile (
- "movd (%2),%%xmm2 \n"
- "movd 0x4(%2),%%xmm3 \n"
- "movd 0x8(%2),%%xmm4 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
-
- // 8 pixel loop.
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm6,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm5 \n"
- "psraw $0x7,%%xmm0 \n"
- "psraw $0x7,%%xmm5 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddsw %%xmm1,%%xmm5 \n"
- "psraw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "sub $0x8,%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "movdqa %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(matrix_argb) // %2
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-// aligned to 16 bytes
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- asm volatile (
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
-
- // 4 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqa (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "sub $0x4,%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBQUANTIZEROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
- asm volatile (
- "sub %1,%2 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
-
- // 4 pixel loop \n"
- ".p2align 2 \n"
- "40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqa (%1,%2,1),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqa 0x10(%1,%2,1),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqa 0x20(%1,%2,1),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqa 0x30(%1,%2,1),%%xmm5 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqa %%xmm2,(%1) \n"
- "movdqa %%xmm3,0x10(%1) \n"
- "movdqa %%xmm4,0x20(%1) \n"
- "movdqa %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop \n"
- ".p2align 2 \n"
- "10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%1,%2,1),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
-
- "19: \n"
- : "+r"(row), // %0
- "+r"(cumsum), // %1
- "+r"(previous_cumsum), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
-void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst, int count) {
- asm volatile (
- "movd %5,%%xmm4 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "rcpss %%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
-
- // 4 pixel loop \n"
- ".p2align 2 \n"
- "40: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "psubd (%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd (%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop \n"
- ".p2align 2 \n"
- "10: \n"
- "movdqa (%0),%%xmm0 \n"
- "psubd (%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd (%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(topleft), // %0
- "+r"(botleft), // %1
- "+r"(dst), // %2
- "+rm"(count) // %3
- : "r"(static_cast<intptr_t>(width)), // %4
- "rm"(area) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
-#ifdef HAS_ARGBSHADE_SSE2
-// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- asm volatile (
- "movd %3,%%xmm2 \n"
- "sub %0,%1 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
-
- // 4 pixel loop.
- ".p2align 2 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
- );
-}
-#endif // HAS_ARGBSHADE_SSE2
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// TODO(fbarchard): Find 64 bit way to avoid masking.
-// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
-// Copy ARGB pixels from source image with slope to a row of destination.
-// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
-// an error if movq is used. movd %%xmm0,%1
-
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width) {
- intptr_t src_argb_stride_temp = src_argb_stride;
- intptr_t temp = 0;
- asm volatile (
- "movq (%3),%%xmm2 \n"
- "movq 0x8(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
-
- // 4 pixel loop \n"
- ".p2align 4 \n"
- "40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "cvttps2dq %%xmm3,%%xmm1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
-#if defined(__x86_64__)
- "movd %%xmm0,%1 \n"
- "mov %1,%5 \n"
- "and $0x0fffffff,%1 \n"
- "shr $32,%5 \n"
- "pshufd $0xEE,%%xmm0,%%xmm0 \n"
-#else
- "movd %%xmm0,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-#endif
- "movd (%0,%1,1),%%xmm1 \n"
- "movd (%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
-#if defined(__x86_64__)
- "movd %%xmm0,%1 \n"
- "mov %1,%5 \n"
- "and $0x0fffffff,%1 \n"
- "shr $32,%5 \n"
-#else
- "movd %%xmm0,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%5 \n"
-#endif
- "movd (%0,%1,1),%%xmm0 \n"
- "movd (%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "sub $0x4,%4 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
-
- // 1 pixel loop \n"
- ".p2align 4 \n"
- "10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%1 \n"
-#if defined(__x86_64__)
- "and $0x0fffffff,%1 \n"
-#endif
- "movd (%0,%1,1),%%xmm0 \n"
- "sub $0x1,%4 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_stride_temp), // %1
- "+r"(dst_argb), // %2
- "+r"(uv_dudv), // %3
- "+rm"(width), // %4
- "+r"(temp) // %5
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-#endif // HAS_ARGBAFFINEROW_SSE2
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 2f \n"
- "cmp $0x40,%3 \n"
- "je 3f \n"
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "2: \n"
- "movdqa (%1),%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqa (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- "4: \n"
- ".p2align 4 \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
- );
-}
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index e3b01f2..2a3da89 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -4,21 +4,126 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
+// This module is for Visual C 32/64 bit and clangcl 32 bit
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+
+#if defined(_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h> // For _mm_maddubs_epi16
+#endif
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// This module is for Visual C x86.
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+// 64 bit
+#if defined(_M_X64)
-// TODO(fbarchard): I420ToRGB24, I420ToRAW
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8;
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+ a_buf += 8;
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB(yuvconstants) \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm2 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+ xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
+ xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
+ xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm0 = _mm_adds_epi16(xmm0, xmm4); \
+ xmm1 = _mm_adds_epi16(xmm1, xmm4); \
+ xmm2 = _mm_adds_epi16(xmm2, xmm4); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+
+// Store 8 ARGB values.
+#define STOREARGB \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
+ _mm_storeu_si128((__m128i *)dst_argb, xmm0); \
+ _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
+ dst_argb += 32;
+
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm4;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ while (width > 0) {
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ while (width > 0) {
+ READYUVA422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
+// 32 bit
+#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB.
@@ -26,14 +131,33 @@
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
+// JPeg full range.
+static const vec8 kARGBToYJ = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
static const vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
+static const vec8 kARGBToUJ = {
+ 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
static const vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
+static const vec8 kARGBToVJ = {
+ -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
// Constants for BGRA.
static const vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
@@ -77,11 +201,20 @@
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
+// 7 bit fixed point 0.5.
+static const vec16 kAddYJ64 = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+
static const uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
+static const uvec16 kAddUVJ128 = {
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -92,24 +225,22 @@
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-// Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+// Shuffle table for converting RAW to RGB24. First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
-// Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+// Shuffle table for converting RAW to RGB24. Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
-// Shuffle table for converting RGBA to ARGB.
-static const uvec8 kShuffleMaskRGBAToARGB = {
- 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
-
-// Shuffle table for converting ARGB to RGBA.
-static const uvec8 kShuffleMaskARGBToRGBA = {
- 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+// Shuffle table for converting RAW to RGB24. Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGB to RGB24.
@@ -122,16 +253,51 @@
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {
+ 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
+ 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {
+ 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
+ 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
+};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {
+ 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
+ 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
+};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {
+ 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
+ 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
+};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- align 16
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
@@ -141,8 +307,8 @@
punpckhwd xmm1, xmm1
por xmm0, xmm5
por xmm1, xmm5
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -150,101 +316,48 @@
}
}
-__declspec(naked) __declspec(align(16))
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
-__asm {
- mov eax, [esp + 4] // src_bgra
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- movdqa xmm5, kShuffleMaskBGRAToARGB
- sub edx, eax
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
- align 16
- convertloop:
- movdqa xmm0, [eax]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
- jg convertloop
+ convertloop:
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8
+ vpunpcklbw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhwd ymm1, ymm0, ymm0
+ vpunpcklwd ymm0, ymm0, ymm0
+ vpor ymm0, ymm0, ymm5
+ vpor ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
ret
}
}
+#endif // HAS_J400TOARGBROW_AVX2
-__declspec(naked) __declspec(align(16))
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
-__asm {
- mov eax, [esp + 4] // src_abgr
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- movdqa xmm5, kShuffleMaskABGRToARGB
- sub edx, eax
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
-__asm {
- mov eax, [esp + 4] // src_rgba
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- movdqa xmm5, kShuffleMaskRGBAToARGB
- sub edx, eax
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
-__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgba
- mov ecx, [esp + 12] // pix
- movdqa xmm5, kShuffleMaskARGBToRGBA
- sub edx, eax
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-__asm {
+__declspec(naked)
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ __asm {
mov eax, [esp + 4] // src_rgb24
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- movdqa xmm4, kShuffleMaskRGB24ToARGB
+ movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -256,35 +369,34 @@
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int pix) {
-__asm {
+ int width) {
+ __asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- movdqa xmm4, kShuffleMaskRAWToARGB
+ movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -296,18 +408,46 @@
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_rgb24
+ mov ecx, [esp + 12] // width
+ movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
+ movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
+ movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 4]
+ movdqu xmm2, [eax + 8]
+ lea eax, [eax + 24]
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 8
jg convertloop
ret
}
@@ -320,10 +460,10 @@
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
- int pix) {
-__asm {
+ int width) {
+ __asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
movd xmm5, eax
pshufd xmm5, xmm5, 0
@@ -340,11 +480,10 @@
mov eax, [esp + 4] // src_rgb565
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
- align 16
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0
@@ -361,8 +500,8 @@
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -370,11 +509,158 @@
}
}
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked)
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
+ vpsllw ymm4, ymm4, 10
+ vpsrlw ymm4, ymm4, 5
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
+ vpand ymm1, ymm0, ymm3 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpand ymm0, ymm0, ymm4 // G in middle 6 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
+ vpor ymm0, ymm0, ymm7 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked)
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
+ vpsllw ymm1, ymm0, 1 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpand ymm1, ymm1, ymm3
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpsraw ymm2, ymm0, 8 // A
+ vpand ymm0, ymm0, ymm4 // G in middle 5 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
+ vpand ymm2, ymm2, ymm7
+ vpor ymm0, ymm0, ymm2 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked)
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+ vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
+ vpand ymm2, ymm0, ymm5 // mask high nibbles
+ vpand ymm0, ymm0, ymm4 // mask low nibbles
+ vpsrlw ymm3, ymm2, 4
+ vpsllw ymm1, ymm0, 4
+ vpor ymm2, ymm2, ymm3
+ vpor ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm2, ymm2, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm2
+ vpunpcklbw ymm0, ymm0, ymm2
+ vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB4444TOARGBROW_AVX2
+
// 24 instructions
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
- int pix) {
-__asm {
+ int width) {
+ __asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
movd xmm5, eax
pshufd xmm5, xmm5, 0
@@ -390,11 +676,10 @@
mov eax, [esp + 4] // src_argb1555
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
- align 16
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0
@@ -415,8 +700,8 @@
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -425,10 +710,10 @@
}
// 18 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
- int pix) {
-__asm {
+ int width) {
+ __asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
movd xmm4, eax
pshufd xmm4, xmm4, 0
@@ -436,11 +721,10 @@
pslld xmm5, 4
mov eax, [esp + 4] // src_argb4444
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
- align 16
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0
@@ -455,8 +739,8 @@
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -464,20 +748,19 @@
}
}
-__declspec(naked) __declspec(align(16))
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
- movdqa xmm6, kShuffleMaskARGBToRGB24
+ mov ecx, [esp + 12] // width
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
- align 16
convertloop:
- movdqa xmm0, [eax] // fetch 16 pixels of argb
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
@@ -489,13 +772,13 @@
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
por xmm0, xmm4 // 4 bytes from 1 for 0
pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqa [edx], xmm0 // store 0
+ movdqu [edx], xmm0 // store 0
por xmm1, xmm5 // 8 bytes from 2 for 1
psrldq xmm2, 8 // 4 bytes from 2
pslldq xmm3, 4 // 12 bytes from 3 for 2
por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqa [edx + 16], xmm1 // store 1
- movdqa [edx + 32], xmm2 // store 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop
@@ -503,20 +786,19 @@
}
}
-__declspec(naked) __declspec(align(16))
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
- movdqa xmm6, kShuffleMaskARGBToRAW
+ mov ecx, [esp + 12] // width
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
- align 16
convertloop:
- movdqa xmm0, [eax] // fetch 16 pixels of argb
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
@@ -528,13 +810,13 @@
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
por xmm0, xmm4 // 4 bytes from 1 for 0
pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqa [edx], xmm0 // store 0
+ movdqu [edx], xmm0 // store 0
por xmm1, xmm5 // 8 bytes from 2 for 1
psrldq xmm2, 8 // 4 bytes from 2
pslldq xmm3, 4 // 12 bytes from 3 for 2
por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqa [edx + 16], xmm1 // store 1
- movdqa [edx + 32], xmm2 // store 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop
@@ -542,12 +824,12 @@
}
}
-__declspec(naked) __declspec(align(16))
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
psrld xmm3, 27
pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
@@ -556,9 +838,8 @@
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
- align 16
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
pslld xmm0, 8 // R
@@ -572,7 +853,7 @@
por xmm0, xmm1 // BGR
packssdw xmm0, xmm0
lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
lea edx, [edx + 8]
sub ecx, 4
jg convertloop
@@ -580,13 +861,101 @@
}
}
-// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ __asm {
+
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
+ movd xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // width
+ punpcklbw xmm6, xmm6 // make dither 16 bytes
+ movdqa xmm7, xmm6
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ paddusb xmm0, xmm6 // add dither
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked)
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ vbroadcastss xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // width
+ vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
+ vpermq ymm6, ymm6, 0xd8
+ vpunpcklwd ymm6, ymm6, ymm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpaddusb ymm0, ymm0, ymm6 // add dither
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked)
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
psrld xmm4, 27
movdqa xmm5, xmm4 // generate mask 0x000003e0
@@ -596,9 +965,8 @@
pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15
- align 16
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
movdqa xmm3, xmm0 // R
@@ -623,25 +991,24 @@
}
}
-__declspec(naked) __declspec(align(16))
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
psllw xmm4, 12
movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8
- align 16
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble
- psrl xmm0, 4
- psrl xmm1, 8
+ psrld xmm0, 4
+ psrld xmm1, 8
por xmm0, xmm1
packuswb xmm0, xmm0
lea eax, [eax + 16]
@@ -653,51 +1020,124 @@
}
}
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked)
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked)
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm4, ymm4, ymm4
+ vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
+ vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
+ vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
+ vpslld ymm7, ymm7, 15
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm3, ymm0, 9 // R
+ vpsrld ymm2, ymm0, 6 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrad ymm0, ymm0, 16 // A
+ vpand ymm3, ymm3, ymm6 // R
+ vpand ymm2, ymm2, ymm5 // G
+ vpand ymm1, ymm1, ymm4 // B
+ vpand ymm0, ymm0, ymm7 // A
+ vpor ymm0, ymm0, ymm1 // BA
+ vpor ymm2, ymm2, ymm3 // GR
+ vpor ymm0, ymm0, ymm2 // BGRA
+ vpackssdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked)
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
+ vpsllw ymm4, ymm4, 12
+ vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpand ymm1, ymm0, ymm4 // high nibble
+ vpand ymm0, ymm0, ymm3 // low nibble
+ vpsrld ymm1, ymm1, 8
+ vpsrld ymm0, ymm0, 4
+ vpor ymm0, ymm0, ymm1
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB4444ROW_AVX2
+
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+__declspec(naked)
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ __asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kARGBToY
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToY
+ movdqa xmm5, xmmword ptr kAddY16
- align 16
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kARGBToY
-
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -714,29 +1154,30 @@
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+__declspec(naked)
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ __asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kBGRAToY
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToYJ
+ movdqa xmm5, xmmword ptr kAddYJ64
- align 16
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -744,28 +1185,113 @@
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ paddw xmm2, xmm5
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd for vphaddw + vpackuswb vpermd.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked)
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+ __asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kBGRAToY
+ mov ecx, [esp + 12] /* width */
+ vbroadcastf128 ymm4, xmmword ptr kARGBToY
+ vbroadcastf128 ymm5, xmmword ptr kAddY16
+ vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
- align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vpaddb ymm0, ymm0, ymm5 // add 16 for Y
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked)
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
+ vbroadcastf128 ymm5, xmmword ptr kAddYJ64
+ vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
+ vpaddw ymm2, ymm2, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+__declspec(naked)
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kBGRAToY
+ movdqa xmm5, xmmword ptr kAddY16
+
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -782,58 +1308,23 @@
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kABGRToY
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+__declspec(naked)
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ __asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kABGRToY
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kABGRToY
+ movdqa xmm5, xmmword ptr kAddY16
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -850,58 +1341,23 @@
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kRGBAToY
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-__asm {
+__declspec(naked)
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+ __asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kRGBAToY
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kRGBAToY
+ movdqa xmm5, xmmword ptr kAddY16
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -918,111 +1374,45 @@
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+ __asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kARGBToV
+ movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
- align 16
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
-__asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1050,10 +1440,10 @@
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1062,103 +1452,298 @@
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUVJ128
+ movdqa xmm6, xmmword ptr kARGBToVJ
+ movdqa xmm7, xmmword ptr kARGBToUJ
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked)
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToV
+ vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88
+ vshufps ymm0, ymm0, ymm1, 0xdd
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpsraw ymm1, ymm1, 8
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
+ vpaddb ymm0, ymm0, ymm5 // -> unsigned
+
+ // step 3 - store 16 U and 16 V values
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+__declspec(naked)
+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToV
+ vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88
+ vshufps ymm0, ymm0, ymm1, 0xdd
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
+ vpaddw ymm0, ymm0, ymm5
+ vpsraw ymm1, ymm1, 8
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
+
+ // step 3 - store 16 U and 16 V values
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+__declspec(naked)
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kARGBToV
+ movdqa xmm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* convert to U and V */
+ movdqu xmm0, [eax] // U
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+
+ movdqu xmm0, [eax] // V
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ lea eax, [eax + 64]
+ movdqu [edx + edi], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+ __asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
- movdqa xmm5, kAddUV128
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kBGRAToV
+ movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v
- align 16
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
-__asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1186,10 +1771,10 @@
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1198,103 +1783,37 @@
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+ __asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kABGRToV
+ movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v
- align 16
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
-__asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1322,10 +1841,10 @@
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1334,103 +1853,37 @@
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
-__asm {
+ __asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kRGBAToV
+ movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v
- align 16
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
-__asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1458,10 +1911,10 @@
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1471,62 +1924,505 @@
}
#endif // HAS_ARGBTOYROW_SSSE3
-#ifdef HAS_I422TOARGBROW_SSSE3
+// Read 16 UV from 444
+#define READYUV444_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ }
-#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ }
-#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
-#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
-#define UR 0
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
+#define READYUVA422_AVX2 __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vpermq ymm5, ymm5, 0xd8 \
+ __asm lea ebp, [ebp + 16] \
+ }
-#define VB 0
-#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
-#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+// Read 4 UV from 411, upsample to 16 UV.
+#define READYUV411_AVX2 __asm { \
+ __asm vmovd xmm0, dword ptr [esi] /* U */ \
+ __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ }
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ }
-static const vec8 kUVToB = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
+// Read 8 UV from NV21, upsample to 16 UV.
+#define READNV21_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ }
-static const vec8 kUVToR = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm lea eax, [eax + 32] \
+ }
-static const vec8 kUVToG = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm lea eax, [eax + 32] \
+ }
-static const vec8 kVUToB = {
- VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) __asm { \
+ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
+ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
+ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
+ __asm vpsubw ymm2, ymm3, ymm2 \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
+ __asm vpsubw ymm1, ymm3, ymm1 \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
+ __asm vpsubw ymm0, ymm3, ymm0 \
+ /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
+ __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vpsraw ymm0, ymm0, 6 \
+ __asm vpsraw ymm1, ymm1, 6 \
+ __asm vpsraw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ }
-static const vec8 kVUToR = {
- VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vmovdqu 0[edx], ymm1 \
+ __asm vmovdqu 32[edx], ymm0 \
+ __asm lea edx, [edx + 64] \
+ }
-static const vec8 kVUToG = {
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
+// Store 16 RGBA values.
+#define STORERGBA_AVX2 __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vmovdqu [edx], ymm0 \
+ __asm vmovdqu [edx + 32], ymm1 \
+ __asm lea edx, [edx + 64] \
+ }
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+__declspec(naked)
+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+
+ convertloop:
+ READYUVA422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I444ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ convertloop:
+ READYUV444_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I411ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // abgr
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV411_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I411TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV12_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // VU
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV21_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_YUY2TOARGBROW_AVX2
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUY2_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#ifdef HAS_UYVYTOARGBROW_AVX2
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READUYVY_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+__declspec(naked)
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // abgr
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STORERGBA_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+// Allows a conversion with half size scaling.
-// Read 8 UV from 411.
+// Read 8 UV from 444.
#define READYUV444 __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
- __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
}
// Read 4 UV from 422, upsample to 8 UV.
@@ -1536,45 +2432,99 @@
__asm lea esi, [esi + 4] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ }
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8] \
}
// Read 2 UV from 411, upsample to 8 UV.
-#define READYUV411 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
+// __asm pinsrw xmm0, [esi], 0 /* U */
+// __asm pinsrw xmm1, [esi + edi], 0 /* V */
+#define READYUV411_EBX __asm { \
+ __asm movzx ebx, word ptr [esi] /* U */ \
+ __asm movd xmm0, ebx \
+ __asm movzx ebx, word ptr [esi + edi] /* V */ \
+ __asm movd xmm1, ebx \
__asm lea esi, [esi + 2] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
}
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
+ __asm movq xmm0, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ }
+
+// Read 4 VU from NV21, upsample to 8 UV.
+#define READNV21 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm lea esi, [esi + 8] \
+ __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ }
+
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
+#define READYUY2 __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
+ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
+ __asm movdqu xmm0, [eax] /* UV */ \
+ __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm lea eax, [eax + 16] \
+ }
+
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
+#define READUYVY __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
+ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
+ __asm movdqu xmm0, [eax] /* UV */ \
+ __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm lea eax, [eax + 16] \
}
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB __asm { \
- /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+#define YUVTORGB(YuvConstants) __asm { \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
- /* Step 2: Find Y contribution to 8 R,G,B values */ \
- __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
- __asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
- __asm paddsw xmm0, xmm3 /* B += Y */ \
- __asm paddsw xmm1, xmm3 /* G += Y */ \
- __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm movdqa xmm3, xmm0 \
+ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
+ __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm psubw xmm0, xmm1 \
+ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
+ __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm psubw xmm1, xmm2 \
+ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
+ __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm psubw xmm2, xmm3 \
+ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
+ __asm paddsw xmm0, xmm4 /* B += Y */ \
+ __asm paddsw xmm1, xmm4 /* G += Y */ \
+ __asm paddsw xmm2, xmm4 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
@@ -1583,721 +2533,515 @@
__asm packuswb xmm2, xmm2 /* R */ \
}
-// Convert 8 pixels: 8 VU and 8 Y.
-#define YVUTORGB __asm { \
- /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+// Store 8 ARGB values.
+#define STOREARGB __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
- /* Step 2: Find Y contribution to 8 R,G,B values */ \
- __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
- __asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
- __asm paddsw xmm0, xmm3 /* B += Y */ \
- __asm paddsw xmm1, xmm3 /* G += Y */ \
- __asm paddsw xmm2, xmm3 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm0 \
+ __asm movdqu 16[edx], xmm1 \
+ __asm lea edx, [edx + 32] \
}
-// 8 pixels, dest aligned 16.
+// Store 8 BGRA values.
+#define STOREBGRA __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGBA values.
+#define STORERGBA __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGB24 values.
+#define STORERGB24 __asm { \
+ /* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
+// Store 8 RGB565 values.
+#define STORERGB565 __asm { \
+ /* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
+ __asm packssdw xmm0, xmm1 \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm lea edx, [edx + 16] \
+ }
+
+// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* argb_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- align 16
convertloop:
READYUV444
- YUVTORGB
+ YUVTORGB(ebx)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
+ pop ebx
pop edi
pop esi
ret
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked)
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGB24
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked)
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb565_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
+ psrld xmm5, 27
+ pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
+ psrld xmm6, 26
+ pslld xmm6, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
+ pslld xmm7, 11
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGB565
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* argb_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 16
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(ebx)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
+ pop ebx
pop edi
pop esi
ret
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
+__declspec(naked)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+
+ convertloop:
+ READYUVA422
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* argb_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov edx, [esp + 16 + 16] // abgr
+ mov ebp, [esp + 16 + 20] // yuvconstants
+ mov ecx, [esp + 16 + 24] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- align 16
convertloop:
- READYUV411
- YUVTORGB
+ READYUV411_EBX
+ YUVTORGB(ebp)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
+ pop ebp
+ pop ebx
pop edi
pop esi
ret
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
- uint8* argb_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // UV
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 16
convertloop:
READNV12
- YUVTORGB
+ YUVTORGB(ebx)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
+ pop ebx
pop esi
ret
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* argb_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- __asm {
- push esi
- push edi
+ push ebx
mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
+ mov esi, [esp + 8 + 8] // VU
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width
- sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 16
convertloop:
- READYUV444
- YUVTORGB
+ READNV21
+ YUVTORGB(ebx)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
- pop edi
+ pop ebx
pop esi
ret
}
}
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* argb_buf,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READYUV411
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* argb_buf,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // UV
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READNV12
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* argb_buf,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* bgra_buf,
+// 8 pixels.
+// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // bgra
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into BGRA
+ push ebx
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
+
+ convertloop:
+ READYUY2
+ YUVTORGB(ebx)
+ STOREARGB
+
sub ecx, 8
jg convertloop
- pop edi
- pop esi
+ pop ebx
ret
}
}
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* bgra_buf,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // bgra
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* abgr_buf,
+// 8 pixels.
+// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // abgr
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
+ push ebx
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 16
convertloop:
- READYUV422
- YUVTORGB
+ READUYVY
+ YUVTORGB(ebx)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
- pop edi
- pop esi
+ pop ebx
ret
}
}
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* abgr_buf,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // abgr
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void I422ToRGBARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* rgba_buf,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgba
- mov ecx, [esp + 8 + 20] // width
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
sub edi, esi
- pxor xmm4, xmm4
- align 16
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(ebx)
+ STORERGBA
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
+ pop ebx
pop edi
pop esi
ret
}
}
-
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgba_buf,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgba
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 16
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
#endif // HAS_I422TOARGBROW_SSSE3
-#ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked) __declspec(align(16))
-void YToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
+__declspec(naked)
+void I400ToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
__asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ movd xmm2, eax
+ pshufd xmm2, xmm2,0
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
- mov eax,0x10001000
- movd xmm3,eax
- pshufd xmm3,xmm3,0
- mov eax,0x012a012a
- movd xmm2,eax
- pshufd xmm2,xmm2,0
+
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
- align 16
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0 // Y.Y
- psubusw xmm0, xmm3
pmulhuw xmm0, xmm2
+ psubusw xmm0, xmm3
+ psrlw xmm0, 6
packuswb xmm0, xmm0 // G
// Step 2: Weave into ARGB
@@ -2307,84 +3051,125 @@
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm0, xmm4
por xmm1, xmm4
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
-
ret
}
}
-#endif // HAS_YTOARGBROW_SSE2
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ vmovd xmm2, eax
+ vbroadcastss ymm2, xmm2
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ vmovd xmm3, eax
+ vbroadcastss ymm3, xmm3
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+ vpslld ymm4, ymm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+ vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+ vpmulhuw ymm0, ymm0, ymm2
+ vpsubusw ymm0, ymm0, ymm3
+ vpsrlw ymm0, ymm0, 6
+ vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
+ vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm4
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I400TOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3
-
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
-__declspec(naked) __declspec(align(16))
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
-__asm {
+ __asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- movdqa xmm5, kShuffleMirror
- lea eax, [eax - 16]
+ movdqa xmm5, xmmword ptr kShuffleMirror
- align 16
convertloop:
- movdqa xmm0, [eax + ecx]
+ movdqu xmm0, [eax - 16 + ecx]
pshufb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
#endif // HAS_MIRRORROW_SSSE3
-#ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
-__declspec(naked) __declspec(align(16))
-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
-__asm {
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked)
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- lea eax, [eax - 16]
+ vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
- align 16
convertloop:
- movdqu xmm0, [eax + ecx]
- movdqa xmm1, xmm0 // swap bytes
- psllw xmm0, 8
- psrlw xmm1, 8
- por xmm0, xmm1
- pshuflw xmm0, xmm0, 0x1b // swap words
- pshufhw xmm0, xmm0, 0x1b
- pshufd xmm0, xmm0, 0x4e // swap qwords
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
+ vmovdqu ymm0, [eax - 32 + ecx]
+ vpshufb ymm0, ymm0, ymm5
+ vpermq ymm0, ymm0, 0x4e // swap high and low halfs
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
+ vzeroupper
ret
}
}
-#endif // HAS_MIRRORROW_SSE2
+#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORROW_UV_SSSE3
+#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
-__declspec(naked) __declspec(align(16))
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+__declspec(naked)
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
__asm {
push edi
@@ -2392,73 +3177,91 @@
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
- movdqa xmm1, kShuffleMirrorUV
+ movdqa xmm1, xmmword ptr kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
- align 16
convertloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
- sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg convertloop
pop edi
ret
}
}
-#endif // HAS_MIRRORROW_UV_SSSE3
+#endif // HAS_MIRRORUVROW_SSSE3
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-
-// Shuffle table for reversing the bytes.
-static const uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
-__declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
-__asm {
+#ifdef HAS_ARGBMIRRORROW_SSE2
+__declspec(naked)
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- movdqa xmm5, kARGBShuffleMirror
- lea eax, [eax - 16]
+ lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
- align 16
convertloop:
- movdqa xmm0, [eax + ecx * 4]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufd xmm0, xmm0, 0x1b
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
}
}
-#endif // HAS_ARGBMIRRORROW_SSSE3
+#endif // HAS_ARGBMIRRORROW_SSE2
-#ifdef HAS_SPLITUV_SSE2
-__declspec(naked) __declspec(align(16))
-void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked)
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
+
+ convertloop:
+ vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked)
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uv
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
+ mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
- align 16
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
@@ -2468,8 +3271,8 @@
psrlw xmm2, 8 // odd bytes
psrlw xmm3, 8
packuswb xmm2, xmm3
- movdqa [edx], xmm0
- movdqa [edx + edi], xmm2
+ movdqu [edx], xmm0
+ movdqu [edx + edi], xmm2
lea edx, [edx + 16]
sub ecx, 16
jg convertloop
@@ -2478,58 +3281,363 @@
ret
}
}
-#endif // HAS_SPLITUV_SSE2
+
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked)
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm2, ymm0, 8 // odd bytes
+ vpsrlw ymm3, ymm1, 8
+ vpand ymm0, ymm0, ymm5 // even bytes
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpackuswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 0xd8
+ vpermq ymm2, ymm2, 0xd8
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + edi], ymm2
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked)
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked)
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 32 U's
+ vmovdqu ymm1, [eax + edx] // and 32 V's
+ lea eax, [eax + 32]
+ vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+ vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+ vextractf128 [edi], ymm2, 0 // bytes 0..15
+ vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+ vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+ vextractf128 [edi + 48], ymm0, 1 // bytes 47..63
+ lea edi, [edi + 64]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_COPYROW_SSE2
// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- sub edx, eax
+ test eax, 15
+ jne convertloopu
+ test edx, 15
+ jne convertloopu
- align 16
- convertloop:
+ convertloopa:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
- movdqa [eax + edx], xmm0
- movdqa [eax + edx + 16], xmm1
lea eax, [eax + 32]
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
sub ecx, 32
- jg convertloop
+ jg convertloopa
+ ret
+
+ convertloopu:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloopu
ret
}
}
#endif // HAS_COPYROW_SSE2
-#ifdef HAS_COPYROW_X86
-__declspec(naked) __declspec(align(16))
-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+#ifdef HAS_COPYROW_AVX
+// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 64
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_COPYROW_AVX
+
+// Multiple of 1.
+__declspec(naked)
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
mov ecx, [esp + 12] // count
- shr ecx, 2
- rep movsd
+ rep movsb
mov edi, edx
mov esi, eax
ret
}
}
-#endif // HAS_COPYROW_X86
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked)
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + 32]
+ lea eax, [eax + 64]
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_a
+ mov ecx, [esp + 12] // width
+
+ extractloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm0, 24
+ psrld xmm1, 24
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg extractloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movq xmm2, qword ptr [eax] // 8 Y's
+ lea eax, [eax + 8]
+ punpcklbw xmm2, xmm2
+ punpckhwd xmm3, xmm2
+ punpcklwd xmm2, xmm2
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+__declspec(naked)
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vpmovzxbd ymm1, qword ptr [eax]
+ vpmovzxbd ymm2, qword ptr [eax + 8]
+ lea eax, [eax + 16]
+ vpslld ymm1, ymm1, 24
+ vpslld ymm2, ymm2, 24
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void SetRow8_X86(uint8* dst, uint32 v32, int count) {
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be multiple of 4.
+__declspec(naked)
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm {
+ movzx eax, byte ptr [esp + 8] // v8
+ mov edx, 0x01010101 // Duplicate byte to all bytes.
+ mul edx // overwrites edx with upper part of result.
mov edx, edi
mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
@@ -2538,68 +3646,283 @@
}
}
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void SetRows32_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
+// Write 'count' bytes using an 8 bit value repeated.
+__declspec(naked)
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
__asm {
- push esi
- push edi
- push ebp
- mov edi, [esp + 12 + 4] // dst
- mov eax, [esp + 12 + 8] // v32
- mov ebp, [esp + 12 + 12] // width
- mov edx, [esp + 12 + 16] // dst_stride
- mov esi, [esp + 12 + 20] // height
- lea ecx, [ebp * 4]
- sub edx, ecx // stride - width * 4
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v8
+ mov ecx, [esp + 12] // count
+ rep stosb
+ mov edi, edx
+ ret
+ }
+}
- align 16
- convertloop:
- mov ecx, ebp
+// Write 'count' 32 bit values.
+__declspec(naked)
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
rep stosd
- add edi, edx
- sub esi, 1
- jg convertloop
-
- pop ebp
- pop edi
- pop esi
+ mov edi, edx
ret
}
}
#endif // HAS_SETROW_X86
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked)
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // even bytes are Y
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_y, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // odd bytes are Y
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
+ uint8* dst_y, int width) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 16
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
@@ -2607,17 +3930,16 @@
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
+ mov ecx, [esp + 8 + 20] // width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
- align 16
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
@@ -2641,127 +3963,19 @@
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
+ mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
- align 16
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 16
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -2785,120 +3999,14 @@
}
}
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void UYVYToYRow_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
+ uint8* dst_y, int width) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
+ mov ecx, [esp + 12] // width
- align 16
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 16
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -2906,17 +4014,17 @@
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
@@ -2924,12 +4032,11 @@
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
+ mov ecx, [esp + 8 + 20] // width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -2958,20 +4065,19 @@
}
}
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
+ mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
- align 16
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -2996,127 +4102,122 @@
}
#endif // HAS_YUY2TOYROW_SSE2
-#ifdef HAS_ARGBBLENDROW_SSE2
+#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked)
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 1
- psrlw xmm7, 15
- pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
- psrlw xmm6, 8
+ push edi
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
psllw xmm5, 8
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0x00
- sub ecx, 1
- je convertloop1 // only 1 pixel?
- jl convertloop1b
+ mov eax, 0x807f807f // 32768 + 127 for unbias and round.
+ movd xmm7, eax
+ pshufd xmm7, xmm7, 0x00
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
+ mov esi, [esp + 8 + 12] // alpha
+ mov edi, [esp + 8 + 16] // dst
+ mov ecx, [esp + 8 + 20] // width
+ sub eax, esi
+ sub edx, esi
+ sub edi, esi
- // 1 pixel loop until destination pointer is aligned.
- alignloop1:
- test edx, 15 // aligned?
- je alignloop1b
- movd xmm3, [eax]
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- psrlw xmm3, 8 // alpha
- pshufhw xmm3, xmm3,0F5h // 8 alpha words
- pshuflw xmm3, xmm3,0F5h
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge alignloop1
+ // 8 pixel loop.
+ convertloop8:
+ movq xmm0, qword ptr [esi] // alpha
+ punpcklbw xmm0, xmm0
+ pxor xmm0, xmm5 // a, 255-a
+ movq xmm1, qword ptr [eax + esi] // src0
+ movq xmm2, qword ptr [edx + esi] // src1
+ punpcklbw xmm1, xmm2
+ psubb xmm1, xmm6 // bias src0/1 - 128
+ pmaddubsw xmm0, xmm1
+ paddw xmm0, xmm7 // unbias result - 32768 and round.
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ movq qword ptr [edi + esi], xmm0
+ lea esi, [esi + 8]
+ sub ecx, 8
+ jg convertloop8
- alignloop1b:
- add ecx, 1 - 4
- jl convertloop4b
-
- // 4 pixel loop.
- convertloop4:
- movdqu xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqu xmm2, [esi] // _r_b
- psrlw xmm3, 8 // alpha
- pshufhw xmm3, xmm3,0F5h // 8 alpha words
- pshuflw xmm3, xmm3,0F5h
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqu xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jge convertloop4
-
- convertloop4b:
- add ecx, 4 - 1
- jl convertloop1b
-
- // 1 pixel loop.
- convertloop1:
- movd xmm3, [eax] // src argb
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- psrlw xmm3, 8 // alpha
- pshufhw xmm3, xmm3,0F5h // 8 alpha words
- pshuflw xmm3, xmm3,0F5h
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge convertloop1
-
- convertloop1b:
+ pop edi
pop esi
ret
}
}
-#endif // HAS_ARGBBLENDROW_SSE2
+#endif // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked)
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+ const uint8* alpha, uint8* dst, int width) {
+ __asm {
+ push esi
+ push edi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
+ vpsllw ymm5, ymm5, 8
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ mov eax, 0x807f807f // 32768 + 127 for unbias and round.
+ vmovd xmm7, eax
+ vbroadcastss ymm7, xmm7
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
+ mov esi, [esp + 8 + 12] // alpha
+ mov edi, [esp + 8 + 16] // dst
+ mov ecx, [esp + 8 + 20] // width
+ sub eax, esi
+ sub edx, esi
+ sub edi, esi
+
+ // 32 pixel loop.
+ convertloop32:
+ vmovdqu ymm0, [esi] // alpha
+ vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
+ vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
+ vpxor ymm3, ymm3, ymm5 // a, 255-a
+ vpxor ymm0, ymm0, ymm5 // a, 255-a
+ vmovdqu ymm1, [eax + esi] // src0
+ vmovdqu ymm2, [edx + esi] // src1
+ vpunpckhbw ymm4, ymm1, ymm2
+ vpunpcklbw ymm1, ymm1, ymm2
+ vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
+ vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpmaddubsw ymm0, ymm0, ymm1
+ vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
+ vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
+ vpsrlw ymm3, ymm3, 8
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm3
+ vmovdqu [edi + esi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg convertloop32
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
@@ -3124,15 +4225,9 @@
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
-// Same as SSE2, but replaces:
-// psrlw xmm3, 8 // alpha
-// pshufhw xmm3, xmm3,0F5h // 8 alpha words
-// pshuflw xmm3, xmm3,0F5h
-// with..
-// pshufb xmm3, kShuffleAlpha // alpha
-// Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+// Blend 8 pixels at a time.
+__declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
@@ -3141,7 +4236,7 @@
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 1
+ pcmpeqb xmm7, xmm7 // generate constant 0x0001
psrlw xmm7, 15
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
psrlw xmm6, 8
@@ -3149,81 +4244,17 @@
psllw xmm5, 8
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
-
- sub ecx, 1
- je convertloop1 // only 1 pixel?
- jl convertloop1b
-
- // 1 pixel loop until destination pointer is aligned.
- alignloop1:
- test edx, 15 // aligned?
- je alignloop1b
- movd xmm3, [eax]
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
- movd [edx], xmm0
- lea edx, [edx + 4]
- jge alignloop1
-
- alignloop1b:
- add ecx, 1 - 4
- jl convertloop4b
-
- test eax, 15 // unaligned?
- jne convertuloop4
- test esi, 15 // unaligned?
- jne convertuloop4
+ sub ecx, 4
+ jl convertloop4b // less than 4 pixels?
// 4 pixel loop.
convertloop4:
- movdqa xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqa xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqa xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jge convertloop4
- jmp convertloop4b
-
- // 4 pixel unaligned loop.
- convertuloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
@@ -3236,10 +4267,10 @@
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
- jge convertuloop4
+ sub ecx, 4
+ jge convertloop4
convertloop4b:
add ecx, 4 - 1
@@ -3252,7 +4283,7 @@
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movd xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
@@ -3265,9 +4296,9 @@
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge convertloop1
convertloop1b:
@@ -3277,50 +4308,6 @@
}
#endif // HAS_ARGBBLENDROW_SSSE3
-#ifdef HAS_ARGBATTENUATE_SSE2
-// Attenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
- psrld xmm5, 8
-
- align 16
- convertloop:
- movdqa xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm0 // first 2
- pshufhw xmm2, xmm0,0FFh // 8 alpha words
- pshuflw xmm2, xmm2,0FFh
- pmulhuw xmm0, xmm2 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm1 // next 2 pixels
- pshufhw xmm2, xmm1,0FFh // 8 alpha words
- pshuflw xmm2, xmm2,0FFh
- pmulhuw xmm1, xmm2 // rgb * a
- movdqa xmm2, [eax] // alphas
- psrlw xmm0, 8
- pand xmm2, xmm4
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- pand xmm0, xmm5 // keep original alphas
- por xmm0, xmm2
- sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBATTENUATE_SSE2
-
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
@@ -3330,39 +4317,38 @@
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb0
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
- sub edx, eax
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
pslld xmm3, 24
- movdqa xmm4, kShuffleAlpha0
- movdqa xmm5, kShuffleAlpha1
+ movdqa xmm4, xmmword ptr kShuffleAlpha0
+ movdqa xmm5, xmmword ptr kShuffleAlpha1
- align 16
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpcklbw xmm1, xmm1 // first 2 pixel rgbs
pmulhuw xmm0, xmm1 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
pshufb xmm1, xmm5 // isolate next 2 alphas
- movdqa xmm2, [eax] // read 4 pixels
+ movdqu xmm2, [eax] // read 4 pixels
punpckhbw xmm2, xmm2 // next 2 pixel rgbs
pmulhuw xmm1, xmm2 // rgb * a
- movdqa xmm2, [eax] // mask original alpha
+ movdqu xmm2, [eax] // mask original alpha
+ lea eax, [eax + 16]
pand xmm2, xmm3
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
por xmm0, xmm2 // copy original alpha
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
jg convertloop
ret
@@ -3370,88 +4356,229 @@
}
#endif // HAS_ARGBATTENUATEROW_SSSE3
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+__declspec(naked)
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpshufb ymm2, ymm0, ymm4 // low 4 alphas
+ vpshufb ymm3, ymm1, ymm4 // high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * a
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * a
+ vpand ymm6, ymm6, ymm5 // isolate alpha
+ vpsrlw ymm0, ymm0, 8
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
+ push ebx
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb0
- mov edx, [esp + 8 + 8] // dst_argb
- mov ecx, [esp + 8 + 12] // width
- sub edx, eax
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
+ mov ecx, [esp + 12 + 12] // width
+ lea ebx, fixed_invtbl8
- align 16
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha
movzx edi, byte ptr [eax + 7] // second alpha
punpcklbw xmm0, xmm0 // first 2
- movd xmm2, dword ptr fixed_invtbl8[esi * 4]
- movd xmm3, dword ptr fixed_invtbl8[edi * 4]
- pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
- pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+ movd xmm2, dword ptr [ebx + esi * 4]
+ movd xmm3, dword ptr [ebx + edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm0, xmm2 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 11] // third alpha
movzx edi, byte ptr [eax + 15] // forth alpha
punpckhbw xmm1, xmm1 // next 2
- movd xmm2, dword ptr fixed_invtbl8[esi * 4]
- movd xmm3, dword ptr fixed_invtbl8[edi * 4]
- pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
- pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+ movd xmm2, dword ptr [ebx + esi * 4]
+ movd xmm3, dword ptr [ebx + edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm1, xmm2 // rgb * a
-
- movdqa xmm2, [eax] // alphas
- pand xmm2, xmm4
- packuswb xmm0, xmm1
- por xmm0, xmm2
- sub ecx, 4
- movdqa [eax + edx], xmm0
lea eax, [eax + 16]
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
+
pop edi
pop esi
+ pop ebx
ret
}
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
-static const vec8 kARGBToGray = {
- 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
+ mov ecx, [esp + 12 + 12] // width
+ sub edx, eax
+ lea ebx, fixed_invtbl8
+ vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 12] /* width */
- movdqa xmm4, kARGBToGray
- sub edx, eax
+ movdqa xmm4, xmmword ptr kARGBToYJ
+ movdqa xmm5, xmmword ptr kAddYJ64
- align 16
convertloop:
- movdqa xmm0, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // Add .5 for rounding.
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 G bytes
- movdqa xmm2, [eax] // A
- movdqa xmm3, [eax + 16]
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
psrld xmm2, 24
psrld xmm3, 24
packuswb xmm2, xmm3
@@ -3462,10 +4589,10 @@
movdqa xmm1, xmm0
punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
sub ecx, 8
- movdqa [eax + edx], xmm0
- movdqa [eax + edx + 16], xmm1
- lea eax, [eax + 32]
jg convertloop
ret
}
@@ -3490,41 +4617,40 @@
};
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] /* dst_argb */
mov ecx, [esp + 8] /* width */
- movdqa xmm2, kARGBToSepiaB
- movdqa xmm3, kARGBToSepiaG
- movdqa xmm4, kARGBToSepiaR
+ movdqa xmm2, xmmword ptr kARGBToSepiaB
+ movdqa xmm3, xmmword ptr kARGBToSepiaG
+ movdqa xmm4, xmmword ptr kARGBToSepiaR
- align 16
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm6, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
- movdqa xmm5, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
- movdqa xmm5, [eax] // R
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
- movdqa xmm6, [eax] // A
- movdqa xmm1, [eax + 16]
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
psrld xmm6, 24
psrld xmm1, 24
packuswb xmm6, xmm1
@@ -3533,10 +4659,10 @@
movdqa xmm1, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm5 // BGRA first 4
punpckhwd xmm1, xmm5 // BGRA next 4
- sub ecx, 8
- movdqa [eax], xmm0
- movdqa [eax + 16], xmm1
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
ret
}
@@ -3548,116 +4674,68 @@
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
- int width) {
+__declspec(naked)
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
__asm {
- mov eax, [esp + 4] /* dst_argb */
- mov edx, [esp + 8] /* matrix_argb */
- mov ecx, [esp + 12] /* width */
- movd xmm2, [edx]
- movd xmm3, [edx + 4]
- movd xmm4, [edx + 8]
- pshufd xmm2, xmm2, 0
- pshufd xmm3, xmm3, 0
- pshufd xmm4, xmm4, 0
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* matrix_argb */
+ movdqu xmm5, [ecx]
+ pshufd xmm2, xmm5, 0x00
+ pshufd xmm3, xmm5, 0x55
+ pshufd xmm4, xmm5, 0xaa
+ pshufd xmm5, xmm5, 0xff
+ mov ecx, [esp + 16] /* width */
- align 16
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm6, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm0, xmm2
- pmaddubsw xmm6, xmm2
- movdqa xmm5, [eax] // G
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm5, xmm3
+ pmaddubsw xmm7, xmm2
+ movdqu xmm6, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm6 // B
- phaddsw xmm5, xmm1 // G
- psraw xmm0, 7 // B
- psraw xmm5, 7 // G
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
packuswb xmm0, xmm0 // 8 B values
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
- movdqa xmm5, [eax] // R
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm5, xmm4
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
+ movdqu xmm1, [eax] // R
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
- phaddsw xmm5, xmm1
- psraw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
- movdqa xmm6, [eax] // A
- movdqa xmm1, [eax + 16]
- psrld xmm6, 24
- psrld xmm1, 24
- packuswb xmm6, xmm1
+ pmaddubsw xmm7, xmm4
+ phaddsw xmm1, xmm7 // R
+ movdqu xmm6, [eax] // A
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm6, xmm5
+ pmaddubsw xmm7, xmm5
+ phaddsw xmm6, xmm7 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
+ packuswb xmm1, xmm1 // 8 R values
packuswb xmm6, xmm6 // 8 A values
- movdqa xmm1, xmm0 // Weave BG, RA together
- punpcklbw xmm5, xmm6 // 8 RA values
- punpcklwd xmm0, xmm5 // BGRA first 4
- punpckhwd xmm1, xmm5 // BGRA next 4
- sub ecx, 8
- movdqa [eax], xmm0
- movdqa [eax + 16], xmm1
+ punpcklbw xmm1, xmm6 // 8 RA values
+ movdqa xmm6, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm1 // BGRA first 4
+ punpckhwd xmm6, xmm1 // BGRA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm6
lea eax, [eax + 32]
+ lea edx, [edx + 32]
+ sub ecx, 8
jg convertloop
ret
}
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- push ebp
- mov eax, [esp + 16 + 4] /* dst_argb */
- mov edi, [esp + 16 + 8] /* table_argb */
- mov ecx, [esp + 16 + 12] /* width */
- xor ebx, ebx
- xor edx, edx
-
- align 16
- convertloop:
- mov ebp, dword ptr [eax] // BGRA
- mov esi, ebp
- and ebp, 255
- shr esi, 8
- and esi, 255
- mov bl, [edi + ebp * 4 + 0] // B
- mov dl, [edi + esi * 4 + 1] // G
- mov ebp, dword ptr [eax] // BGRA
- mov esi, ebp
- shr ebp, 16
- shr esi, 24
- and ebp, 255
- mov [eax], bl
- mov [eax + 1], dl
- mov bl, [edi + ebp * 4 + 2] // R
- mov dl, [edi + esi * 4 + 3] // A
- mov [eax + 2], bl
- mov [eax + 3], dl
- lea eax, [eax + 4]
- sub ecx, 1
- jg convertloop
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-#endif // HAS_ARGBCOLORTABLEROW_X86
-
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
__asm {
@@ -3676,32 +4754,514 @@
pcmpeqb xmm6, xmm6 // generate mask 0xff000000
pslld xmm6, 24
- align 16
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
- movdqa xmm7, [eax] // read 4 pixels
+ movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
- sub ecx, 4
- movdqa [eax], xmm0
+ movdqu [eax], xmm0
lea eax, [eax + 16]
+ sub ecx, 4
jg convertloop
ret
}
}
#endif // HAS_ARGBQUANTIZEROW_SSE2
-#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+__declspec(naked)
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ punpcklbw xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked)
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm2, [esi] // read 4 pixels from src_argb1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ lea eax, [eax + 16]
+ lea esi, [esi + 16]
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked)
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ sub ecx, 4
+ jl convertloop49
+
+ convertloop4:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4
+
+ convertloop49:
+ add ecx, 4 - 1
+ jl convertloop19
+
+ convertloop1:
+ movd xmm0, [eax] // read 1 pixels from src_argb0
+ lea eax, [eax + 4]
+ movd xmm1, [esi] // read 1 pixels from src_argb1
+ lea esi, [esi + 4]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop19:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ convertloop:
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vpunpcklbw ymm0, ymm1, ymm1 // low 4
+ vpunpckhbw ymm1, ymm1, ymm1 // high 4
+ vpunpcklbw ymm2, ymm3, ymm5 // low 4
+ vpunpckhbw ymm3, ymm3, ymm5 // high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpackuswb ymm0, ymm0, ymm1
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+__declspec(naked)
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
+ mov edi, [esp + 8 + 12] // src_y2
+ mov edx, [esp + 8 + 16] // dst_sobelx
+ mov ecx, [esp + 8 + 20] // width
+ sub esi, eax
+ sub edi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+__declspec(naked)
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
+ mov edx, [esp + 4 + 12] // dst_sobely
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked)
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+ pslld xmm5, 24 // 0xff000000
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqa xmm2, xmm0 // GG
+ punpcklbw xmm2, xmm0 // First 8
+ punpckhbw xmm0, xmm0 // Next 8
+ movdqa xmm1, xmm2 // GGGG
+ punpcklwd xmm1, xmm2 // First 4
+ punpckhwd xmm2, xmm2 // Next 4
+ por xmm1, xmm5 // GGGA
+ por xmm2, xmm5
+ movdqa xmm3, xmm0 // GGGG
+ punpcklwd xmm3, xmm0 // Next 4
+ punpckhwd xmm0, xmm0 // Last 4
+ por xmm3, xmm5 // GGGA
+ por xmm0, xmm5
+ movdqu [edx], xmm1
+ movdqu [edx + 16], xmm2
+ movdqu [edx + 32], xmm3
+ movdqu [edx + 48], xmm0
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+__declspec(naked)
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked)
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ paddusb xmm2, xmm1 // sobel = sobelx + sobely
+ movdqa xmm3, xmm0 // XA
+ punpcklbw xmm3, xmm5
+ punpckhbw xmm0, xmm5
+ movdqa xmm4, xmm1 // YS
+ punpcklbw xmm4, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa xmm6, xmm4 // YSXA
+ punpcklwd xmm6, xmm3 // First 4
+ punpckhwd xmm4, xmm3 // Next 4
+ movdqa xmm7, xmm1 // YSXA
+ punpcklwd xmm7, xmm0 // Next 4
+ punpckhwd xmm1, xmm0 // Last 4
+ movdqu [edx], xmm6
+ movdqu [edx + 16], xmm4
+ movdqu [edx + 32], xmm7
+ movdqu [edx + 48], xmm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
@@ -3713,31 +5273,85 @@
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
-// aligned.
-void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst, int count) {
+// Does 4 pixels at a time.
+// This function requires alignment on accumulation buffer pointers.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
__asm {
mov eax, topleft // eax topleft
mov esi, botleft // esi botleft
mov edx, width
- movd xmm4, area
+ movd xmm5, area
mov edi, dst
mov ecx, count
- cvtdq2ps xmm4, xmm4
- rcpss xmm4, xmm4 // 1.0f / area
+ cvtdq2ps xmm5, xmm5
+ rcpss xmm4, xmm5 // 1.0f / area
pshufd xmm4, xmm4, 0
sub ecx, 4
jl l4b
+ cmp area, 128 // 128 pixels will not overflow 15 bits.
+ ja l4
+
+ pshufd xmm5, xmm5, 0 // area
+ pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
+ psrld xmm6, 16
+ cvtdq2ps xmm6, xmm6
+ addps xmm5, xmm6 // (65536.0 + area - 1)
+ mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
+ cvtps2dq xmm5, xmm5 // 0.16 fixed point
+ packssdw xmm5, xmm5 // 16 bit shorts
+
+ // 4 pixel loop small blocks.
+ s4:
+ // top left
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
+ packssdw xmm2, xmm3
+
+ pmulhuw xmm0, xmm5
+ pmulhuw xmm2, xmm5
+
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge s4
+
+ jmp l4b
+
// 4 pixel loop
- align 4
l4:
// top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
@@ -3784,9 +5398,8 @@
jl l1b
// 1 pixel loop
- align 4
l1:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
lea eax, [eax + 16]
psubd xmm0, [esi]
@@ -3804,7 +5417,7 @@
l1b:
}
}
-#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
@@ -3816,7 +5429,6 @@
mov edx, cumsum
mov esi, previous_cumsum
mov ecx, width
- sub esi, edx
pxor xmm0, xmm0
pxor xmm1, xmm1
@@ -3826,7 +5438,6 @@
jne l4b
// 4 pixel loop
- align 4
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -3843,25 +5454,26 @@
punpckhwd xmm5, xmm1
paddd xmm0, xmm2
- movdqa xmm2, [edx + esi] // previous row above.
+ movdqu xmm2, [esi] // previous row above.
paddd xmm2, xmm0
paddd xmm0, xmm3
- movdqa xmm3, [edx + esi + 16]
+ movdqu xmm3, [esi + 16]
paddd xmm3, xmm0
paddd xmm0, xmm4
- movdqa xmm4, [edx + esi + 32]
+ movdqu xmm4, [esi + 32]
paddd xmm4, xmm0
paddd xmm0, xmm5
- movdqa xmm5, [edx + esi + 48]
+ movdqu xmm5, [esi + 48]
+ lea esi, [esi + 64]
paddd xmm5, xmm0
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
- movdqa [edx + 32], xmm4
- movdqa [edx + 48], xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ movdqu [edx + 32], xmm4
+ movdqu [edx + 48], xmm5
lea edx, [edx + 64]
sub ecx, 4
@@ -3872,14 +5484,14 @@
jl l1b
// 1 pixel loop
- align 4
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
paddd xmm0, xmm2
- movdqu xmm2, [edx + esi]
+ movdqu xmm2, [esi]
+ lea esi, [esi + 16]
paddd xmm2, xmm0
movdqu [edx], xmm2
lea edx, [edx + 16]
@@ -3891,52 +5503,16 @@
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
-#ifdef HAS_ARGBSHADE_SSE2
-// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- movd xmm2, [esp + 16] // value
- sub edx, eax
- punpcklbw xmm2, xmm2
- punpcklqdq xmm2, xmm2
-
- align 16
- convertloop:
- movdqa xmm0, [eax] // read 4 pixels
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- pmulhuw xmm0, xmm2 // argb * value
- pmulhuw xmm1, xmm2 // argb * value
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBSHADE_SSE2
-
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width) {
__asm {
push esi
push edi
- mov eax, [esp + 12] // src_argb
+ mov eax, [esp + 12] // src_argb
mov esi, [esp + 16] // stride
mov edx, [esp + 20] // dst_argb
mov ecx, [esp + 24] // pointer to uv_dudv
@@ -3962,7 +5538,6 @@
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
- align 4
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
@@ -3984,9 +5559,9 @@
movd xmm0, [eax + edi] // read pixel 3
punpckldq xmm6, xmm0 // combine pixel 2 and 3
addps xmm3, xmm4 // x, y += dx, dy next 2
- sub ecx, 4
movq qword ptr 8[edx], xmm6
lea edx, [edx + 16]
+ sub ecx, 4
jge l4
l4b:
@@ -3994,7 +5569,6 @@
jl l1b
// 1 pixel loop
- align 4
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
@@ -4002,9 +5576,9 @@
addps xmm2, xmm7 // x, y += dx, dy
movd esi, xmm0
movd xmm0, [eax + esi] // copy a pixel
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge l1
l1b:
pop edi
@@ -4014,11 +5588,12 @@
}
#endif // HAS_ARGBAFFINEROW_SSE2
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
-__declspec(naked) __declspec(align(16))
-void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+__declspec(naked)
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
@@ -4027,61 +5602,357 @@
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
+ // Dispatch to specialized filters if applicable.
cmp eax, 0
- je xloop1
- cmp eax, 64
- je xloop2
- movd xmm0, eax // high fraction 0..127
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ sub edi, esi
+ cmp eax, 128
+ je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
+
+ vmovd xmm0, eax // high fraction 0..255
neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
+ add eax, 256
+ vmovd xmm5, eax // low fraction 256..1
+ vpunpcklbw xmm5, xmm5, xmm0
+ vpunpcklwd xmm5, xmm5, xmm5
+ vbroadcastss ymm5, xmm5
+
+ mov eax, 0x80808080 // 128b for bias and rounding.
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+
+ xloop:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm2, [esi + edx]
+ vpunpckhbw ymm1, ymm0, ymm2 // mutates
+ vpunpcklbw ymm0, ymm0, ymm2
+ vpsubb ymm1, ymm1, ymm4 // bias to signed image
+ vpsubb ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm5, ymm1
+ vpmaddubsw ymm0, ymm5, ymm0
+ vpaddw ymm1, ymm1, ymm4 // unbias and round
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 8
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop50
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ rep movsb
+
+ xloop99:
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+// Bilinear filter 16x2 -> 16x1
+// TODO(fbarchard): Consider allowing 256 using memcpy.
+__declspec(naked)
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 /256. Blend 100 / 0.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+
+ movd xmm0, eax // high fraction 0..255
+ neg eax
+ add eax, 256
+ movd xmm5, eax // low fraction 255..1
punpcklbw xmm5, xmm0
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0x00
- align 16
xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- movdqa xmm1, xmm0
+ movdqu xmm0, [esi]
+ movdqu xmm2, [esi + edx]
+ movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqa [esi + edi], xmm0
+ psubb xmm0, xmm4 // bias image by -128
+ psubb xmm1, xmm4
+ movdqa xmm2, xmm5
+ movdqa xmm3, xmm5
+ pmaddubsw xmm2, xmm0
+ pmaddubsw xmm3, xmm1
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+ psrlw xmm2, 8
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [esi + edi], xmm2
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop
+ jmp xloop99
+ // Blend 50 / 50.
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop50
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ movdqu xmm0, [esi]
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop100
+
+ xloop99:
pop edi
pop esi
ret
+ }
+}
- align 16
- xloop1:
- movdqa xmm0, [esi]
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop1
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked)
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqu xmm5, [ecx]
+ mov ecx, [esp + 16] // width
- pop edi
- pop esi
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg wloop
ret
+ }
+}
- align 16
- xloop2:
- movdqa xmm0, [esi]
- pavgb xmm0, [esi + edx]
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked)
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
+ mov ecx, [esp + 16] // width
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked)
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int width) {
+ __asm {
+ push ebx
+ push esi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov esi, [esp + 8 + 12] // shuffler
+ mov ecx, [esp + 8 + 16] // width
+ pxor xmm5, xmm5
+
+ mov ebx, [esi] // shuffler
+ cmp ebx, 0x03000102
+ je shuf_3012
+ cmp ebx, 0x00010203
+ je shuf_0123
+ cmp ebx, 0x00030201
+ je shuf_0321
+ cmp ebx, 0x02010003
+ je shuf_2103
+
+ // TODO(fbarchard): Use one source pointer and 3 offsets.
+ shuf_any1:
+ movzx ebx, byte ptr [esi]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx], bl
+ movzx ebx, byte ptr [esi + 1]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 1], bl
+ movzx ebx, byte ptr [esi + 2]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 2], bl
+ movzx ebx, byte ptr [esi + 3]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 3], bl
+ lea eax, [eax + 4]
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jg shuf_any1
+ jmp shuf99
+
+ shuf_0123:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
+ pshuflw xmm0, xmm0, 01Bh
+ pshufhw xmm1, xmm1, 01Bh
+ pshuflw xmm1, xmm1, 01Bh
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop2
+ jg shuf_0123
+ jmp shuf99
+
+ shuf_0321:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
+ pshuflw xmm0, xmm0, 039h
+ pshufhw xmm1, xmm1, 039h
+ pshuflw xmm1, xmm1, 039h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_0321
+ jmp shuf99
+
+ shuf_2103:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
+ pshuflw xmm0, xmm0, 093h
+ pshufhw xmm1, xmm1, 093h
+ pshuflw xmm1, xmm1, 093h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_2103
+ jmp shuf99
+
+ shuf_3012:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
+ pshuflw xmm0, xmm0, 0C6h
+ pshufhw xmm1, xmm1, 0C6h
+ pshuflw xmm1, xmm1, 0C6h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_3012
+
+ shuf99:
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked)
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
pop edi
pop esi
@@ -4089,9 +5960,310 @@
}
}
-#endif // _M_IX86
+__declspec(naked)
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqu [edi], xmm1
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked)
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* src_argb */
+ mov edx, [esp + 4 + 8] /* dst_argb */
+ mov esi, [esp + 4 + 12] /* poly */
+ mov ecx, [esp + 4 + 16] /* width */
+ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
+
+ // 2 pixel loop.
+ convertloop:
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked)
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ vmovq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Tranform ARGB pixels with color table.
+__declspec(naked)
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Tranform RGB pixels with color table.
+__declspec(naked)
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Tranform RGB pixels with luma table.
+__declspec(naked)
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5
+
+ // 4 pixel loop.
+ convertloop:
+ movdqu xmm0, xmmword ptr [eax] // generate luma ptr
+ pmaddubsw xmm0, xmm3
+ phaddw xmm0, xmm0
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl
+
+ lea eax, [eax + 16]
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(_M_X64)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
+
+#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
diff --git a/files/source/scale.cc b/files/source/scale.cc
index 38910c9..36e3fe5 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -12,2444 +12,341 @@
#include <assert.h>
#include <string.h>
-#include <stdlib.h> // For getenv()
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// Bilinear SSE2 is disabled.
-#define SSE2_DISABLED 1
-
-// Note: Some SSE2 reference manuals
-// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
-
-// Set the following flag to true to revert to only
-// using the reference implementation ScalePlaneBox(), and
-// NOT the optimized versions. Useful for debugging and
-// when comparing the quality of the resulting YUV planes
-// as produced by the optimized and non-optimized versions.
-static bool use_reference_impl_ = false;
-
-LIBYUV_API
-void SetUseReferenceImpl(bool use) {
- use_reference_impl_ = use;
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
}
-// ScaleRowDown2Int also used by planar functions
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-/**
- * NEON downscalers with interpolation.
- *
- * Provided by Fritz Koenig
- *
- */
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-#define HAS_SCALEROWDOWN2_NEON
-// Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width);
-
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-
-#define HAS_SCALEROWDOWN4_NEON
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-#define HAS_SCALEROWDOWN34_NEON
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-#define HAS_SCALEROWDOWN38_NEON
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width);
-// 32x3 -> 12x1
-void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-// 16x2 -> 16x1
-#define HAS_SCALEFILTERROWS_NEON
-void ScaleFilterRows_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction);
-
-/**
- * SSE2 downscalers with interpolation.
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-
-
-// Constants for SSSE3 code
-#elif !defined(YUV_DISABLE_ASM) && \
- (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
-
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-
-// Offsets for source bytes 0 to 9
-CONST uvec8 kShuf0 =
- { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-CONST uvec8 kShuf1 =
- { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-CONST uvec8 kShuf2 =
- { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-CONST uvec8 kShuf01 =
- { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-CONST uvec8 kShuf11 =
- { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-CONST uvec8 kShuf21 =
- { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-CONST uvec8 kMadd01 =
- { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-CONST uvec8 kMadd11 =
- { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-CONST uvec8 kMadd21 =
- { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-CONST vec16 kRound34 =
- { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-CONST uvec8 kShuf38a =
- { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-CONST uvec8 kShuf38b =
- { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-CONST uvec8 kShufAc =
- { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-CONST uvec8 kShufAc3 =
- { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-CONST uvec16 kScaleAc33 =
- { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb0 =
- { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb1 =
- { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb2 =
- { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-CONST uvec16 kScaleAb2 =
- { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-#endif
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-
-#define HAS_SCALEROWDOWN2_SSE2
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 16
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 16
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-// Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
- psrld xmm5, 24
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- packuswb xmm0, xmm0
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_ptr
- mov esi, [esp + 8 + 8] // src_stride
- mov edx, [esp + 8 + 12] // dst_ptr
- mov ecx, [esp + 8 + 16] // dst_width
- lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, [eax + esi * 2]
- movdqa xmm3, [eax + esi * 2 + 16]
- movdqa xmm4, [eax + edi]
- movdqa xmm5, [eax + edi + 16]
- lea eax, [eax + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
- psrlw xmm0, 8
- pand xmm2, xmm7
- pavgw xmm0, xmm2
- packuswb xmm0, xmm0
-
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN8_SSE2
-// Point samples 32 pixels to 4 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
- psrlq xmm5, 56
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1 // 32->16
- packuswb xmm0, xmm0 // 16->8
- packuswb xmm0, xmm0 // 8->4
- sub ecx, 4
- movd dword ptr [edx], xmm0
- lea edx, [edx + 4]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x8 rectangle to 4x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- push edi
- push ebp
- mov eax, [esp + 12 + 4] // src_ptr
- mov esi, [esp + 12 + 8] // src_stride
- mov edx, [esp + 12 + 12] // dst_ptr
- mov ecx, [esp + 12 + 16] // dst_width
- lea edi, [esi + esi * 2] // src_stride * 3
- pxor xmm7, xmm7
-
- align 16
- wloop:
- movdqa xmm0, [eax] // average 8 rows to 1
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- movdqa xmm2, [eax + esi * 2]
- movdqa xmm3, [eax + esi * 2 + 16]
- movdqa xmm4, [eax + edi]
- movdqa xmm5, [eax + edi + 16]
- lea ebp, [eax + esi * 4]
- lea eax, [eax + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, [ebp]
- movdqa xmm3, [ebp + 16]
- movdqa xmm4, [ebp + esi]
- movdqa xmm5, [ebp + esi + 16]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- movdqa xmm4, [ebp + esi * 2]
- movdqa xmm5, [ebp + esi * 2 + 16]
- movdqa xmm6, [ebp + edi]
- pavgb xmm4, xmm6
- movdqa xmm6, [ebp + edi + 16]
- pavgb xmm5, xmm6
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- psadbw xmm0, xmm7 // average 32 pixels to 4
- psadbw xmm1, xmm7
- pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01
- pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx
- por xmm0, xmm1 // -> 3201
- psrlw xmm0, 3
- packuswb xmm0, xmm0
- packuswb xmm0, xmm0
-
- sub ecx, 4
- movd dword ptr [edx], xmm0
- lea edx, [edx + 4]
- jg wloop
-
- pop ebp
- pop edi
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm3, kShuf0
- movdqa xmm4, kShuf1
- movdqa xmm5, kShuf2
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm1
- palignr xmm1, xmm0, 8
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 24
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 kRound34
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShuf01
- movdqa xmm3, kShuf11
- movdqa xmm4, kShuf21
- movdqa xmm5, kMadd01
- movdqa xmm6, kMadd11
- movdqa xmm7, kRound34
-
- align 16
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- sub ecx, 24
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx + 24]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShuf01
- movdqa xmm3, kShuf11
- movdqa xmm4, kShuf21
- movdqa xmm5, kMadd01
- movdqa xmm6, kMadd11
- movdqa xmm7, kRound34
-
- align 16
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- sub ecx, 24
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx+24]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm4, kShuf38a
- movdqa xmm5, kShuf38b
-
- align 16
- xloop:
- movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
- lea eax, [eax + 32]
- pshufb xmm0, xmm4
- pshufb xmm1, xmm5
- paddusb xmm0, xmm1
-
- sub ecx, 12
- movq qword ptr [edx], xmm0 // write 12 pixels
- movhlps xmm1, xmm0
- movd [edx + 8], xmm1
- lea edx, [edx + 12]
- jg xloop
-
- ret
- }
-}
-
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShufAc
- movdqa xmm3, kShufAc3
- movdqa xmm4, kScaleAc33
- pxor xmm5, xmm5
-
- align 16
- xloop:
- movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
- movdqa xmm6, [eax + esi]
- movhlps xmm1, xmm0
- movhlps xmm7, xmm6
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
- movdqa xmm6, [eax + esi * 2]
- lea eax, [eax + 16]
- movhlps xmm7, xmm6
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
-
- movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- pshufb xmm6, xmm2
-
- movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- pshufb xmm7, xmm3
- paddusw xmm6, xmm7
-
- pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
- packuswb xmm6, xmm6
-
- sub ecx, 6
- movd [edx], xmm6 // write 6 pixels
- psrlq xmm6, 16
- movd [edx + 2], xmm6
- lea edx, [edx + 6]
- jg xloop
-
- pop esi
- ret
- }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShufAb0
- movdqa xmm3, kShufAb1
- movdqa xmm4, kShufAb2
- movdqa xmm5, kScaleAb2
-
- align 16
- xloop:
- movdqa xmm0, [eax] // average 2 rows into xmm0
- pavgb xmm0, [eax + esi]
- lea eax, [eax + 16]
-
- movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
- pshufb xmm1, xmm2
- movdqa xmm6, xmm0
- pshufb xmm6, xmm3
- paddusw xmm1, xmm6
- pshufb xmm0, xmm4
- paddusw xmm1, xmm0
-
- pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
- packuswb xmm1, xmm1
-
- sub ecx, 6
- movd [edx], xmm1 // write 6 pixels
- psrlq xmm1, 16
- movd [edx + 2], xmm1
- lea edx, [edx + 6]
- jg xloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEADDROWS_SSE2
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-__declspec(naked) __declspec(align(16))
-static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov esi, [esp + 16 + 4] // src_ptr
- mov edx, [esp + 16 + 8] // src_stride
- mov edi, [esp + 16 + 12] // dst_ptr
- mov ecx, [esp + 16 + 16] // dst_width
- mov ebx, [esp + 16 + 20] // height
- pxor xmm4, xmm4
- dec ebx
-
- align 16
- xloop:
- // first row
- movdqa xmm0, [esi]
- lea eax, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- lea esi, [esi + 16]
- mov ebp, ebx
- test ebp, ebp
- je ydone
-
- // sum remaining rows
- align 16
- yloop:
- movdqa xmm2, [eax] // read 16 pixels
- lea eax, [eax + edx] // advance to next row
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- paddusw xmm0, xmm2 // sum 16 words
- paddusw xmm1, xmm3
- sub ebp, 1
- jg yloop
- ydone:
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm1
- lea edi, [edi + 32]
-
- sub ecx, 16
- jg xloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-#ifndef SSE2_DISABLED
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
-// Normal formula for bilinear interpolation is:
-// source_y_fraction * row1 + (1 - source_y_fraction) row0
-// SSE2 version using the a single multiply of difference:
-// source_y_fraction * (row1 - row0) + row0
-#define HAS_SCALEFILTERROWS_SSE2_DISABLED
-__declspec(naked) __declspec(align(16))
-static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- cmp eax, 0
- je xloop1
- cmp eax, 128
- je xloop2
-
- movd xmm5, eax // xmm5 = y fraction
- punpcklbw xmm5, xmm5
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
- pxor xmm4, xmm4
-
- align 16
- xloop:
- movdqa xmm0, [esi] // row0
- movdqa xmm2, [esi + edx] // row1
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- psubw xmm2, xmm0 // row1 - row0
- psubw xmm3, xmm1
- pmulhw xmm2, xmm5 // scale diff
- pmulhw xmm3, xmm5
- paddw xmm0, xmm2 // sum rows
- paddw xmm1, xmm3
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
-
- punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- align 16
- xloop1:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop1
-
- punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- align 16
- xloop2:
- movdqa xmm0, [esi]
- pavgb xmm0, [esi + edx]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop2
-
- punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
- }
-}
-#endif // SSE2_DISABLED
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
-#define HAS_SCALEFILTERROWS_SSSE3
-__declspec(naked) __declspec(align(16))
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
- cmp eax, 0
- je xloop1
- cmp eax, 64
- je xloop2
- movd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- align 16
- xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
-
- punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
-
- pop edi
- pop esi
- ret
-
- align 16
- xloop1:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop1
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- align 16
- xloop2:
- movdqa xmm0, [esi]
- pavgb xmm0, [esi + edx]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop2
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
- }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEROWDOWN2_SSE2
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%3,1),%%xmm2 \n"
- "movdqa 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu (%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stridex3 = 0;
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0x8,%%xmm7 \n"
- "lea (%4,%4,2),%3 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%4,1),%%xmm2 \n"
- "movdqa 0x10(%0,%4,1),%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa (%0,%4,2),%%xmm2 \n"
- "movdqa 0x10(%0,%4,2),%%xmm3 \n"
- "movdqa (%0,%3,1),%%xmm4 \n"
- "movdqa 0x10(%0,%3,1),%%xmm5 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pand %%xmm7,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(stridex3) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-#endif
- );
-}
-
-#define HAS_SCALEROWDOWN8_SSE2
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlq $0x38,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%1) \n"
- "lea 0x4(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stridex3 = 0;
- intptr_t row4 = 0;
- asm volatile (
- "lea (%5,%5,2),%3 \n"
- "pxor %%xmm7,%%xmm7 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%5,1),%%xmm2 \n"
- "movdqa 0x10(%0,%5,1),%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa (%0,%5,2),%%xmm2 \n"
- "movdqa 0x10(%0,%5,2),%%xmm3 \n"
- "movdqa (%0,%3,1),%%xmm4 \n"
- "movdqa 0x10(%0,%3,1),%%xmm5 \n"
- "lea (%0,%5,4),%4 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa 0x0(%4),%%xmm2 \n"
- "movdqa 0x10(%4),%%xmm3 \n"
- "movdqa 0x0(%4,%5,1),%%xmm4 \n"
- "movdqa 0x10(%4,%5,1),%%xmm5 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "movdqa 0x0(%4,%5,2),%%xmm4 \n"
- "movdqa 0x10(%4,%5,2),%%xmm5 \n"
- "movdqa 0x0(%4,%3,1),%%xmm6 \n"
- "pavgb %%xmm6,%%xmm4 \n"
- "movdqa 0x10(%4,%3,1),%%xmm6 \n"
- "pavgb %%xmm6,%%xmm5 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psadbw %%xmm7,%%xmm0 \n"
- "psadbw %%xmm7,%%xmm1 \n"
- "pshufd $0xd8,%%xmm0,%%xmm0 \n"
- "pshufd $0x8d,%%xmm1,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "psrlw $0x3,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%1) \n"
- "lea 0x4(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+rm"(dst_width), // %2
- "+r"(stridex3), // %3
- "+r"(row4) // %4
- : "r"(static_cast<intptr_t>(src_stride)) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kShuf0), // %0
- "m"(kShuf1), // %1
- "m"(kShuf2) // %2
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%3),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqa 0x10(%0),%%xmm6 \n"
- "movdqa 0x10(%0,%3),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
-
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqa 0x10(%0),%%xmm6 \n"
- "movdqa 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "m"(kShuf38a), // %3
- "m"(kShuf38b) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm4", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
- :
- : "m"(kShufAb0), // %0
- "m"(kShufAb1), // %1
- "m"(kShufAb2), // %2
- "m"(kScaleAb2) // %3
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pavgb (%0,%3,1),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "sub $0x6,%2 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- :
- : "m"(kShufAc), // %0
- "m"(kShufAc3), // %1
- "m"(kScaleAc33) // %2
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa (%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "sub $0x6,%2 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
- int tmp_height = 0;
- intptr_t tmp_src = 0;
- asm volatile (
- "pxor %%xmm4,%%xmm4 \n"
- "sub $0x1,%5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "mov %0,%3 \n"
- "add %6,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "mov %5,%2 \n"
- "test %2,%2 \n"
- "je 3f \n"
- "2: \n"
- "movdqa (%0),%%xmm2 \n"
- "add %6,%0 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "sub $0x1,%2 \n"
- "jg 2b \n"
- "3: \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x10(%3),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_height), // %2
- "+r"(tmp_src), // %3
- "+r"(src_width), // %4
- "+rm"(src_height) // %5
- : "rm"(static_cast<intptr_t>(src_stride)) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-
-#ifndef SSE2_DISABLED
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2_DISABLED
-static void ScaleFilterRows_SSE2(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 2f \n"
- "cmp $0x80,%3 \n"
- "je 3f \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "psubw %%xmm0,%%xmm2 \n"
- "psubw %%xmm1,%%xmm3 \n"
- "pmulhw %%xmm5,%%xmm2 \n"
- "pmulhw %%xmm5,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "2: \n"
- "movdqa (%1),%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqa (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- ".p2align 4 \n"
- "4: \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "pshufhw $0xff,%%xmm0,%%xmm0 \n"
- "punpckhqdq %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // SSE2_DISABLED
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 2f \n"
- "cmp $0x40,%3 \n"
- "je 3f \n"
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "2: \n"
- "movdqa (%1),%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqa (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- ".p2align 4 \n"
- "4: \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "pshufhw $0xff,%%xmm0,%%xmm0 \n"
- "punpckhqdq %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
- );
-}
-#endif // defined(__x86_64__) || defined(__i386__)
-
-// CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[2];
- dst += 2;
- src_ptr += 4;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = src_ptr[0];
- }
-}
-
-void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
- dst += 2;
- s += 4;
- t += 4;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- }
-}
-
-static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[4];
- dst += 2;
- src_ptr += 8;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = src_ptr[0];
- }
-}
-
-static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- intptr_t stride = src_stride;
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 4] + src_ptr[stride + 5] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
- src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
- src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
- 8) >> 4;
- dst += 2;
- src_ptr += 8;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- }
-}
-
-// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
-// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
-static const int kMaxOutputWidth = 640;
-static const int kMaxRow12 = kMaxOutputWidth * 2;
-
-static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[8];
- dst += 2;
- src_ptr += 16;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = src_ptr[0];
- }
-}
-
-// Note calling code checks width is less than max and if not
-// uses ScaleRowDown8_C instead.
-static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
- assert(dst_width <= kMaxOutputWidth);
- ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
- ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
- src_row + kMaxOutputWidth,
- dst_width * 2);
- ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
-}
-
-static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = dst + dst_width;
- do {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[1];
- dst[2] = src_ptr[3];
- dst += 3;
- src_ptr += 4;
- } while (dst < dend);
-}
-
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- uint8* dend = d + dst_width;
- do {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 * 3 + b0 + 2) >> 2;
- d[1] = (a1 * 3 + b1 + 2) >> 2;
- d[2] = (a2 * 3 + b2 + 2) >> 2;
- d += 3;
- s += 4;
- t += 4;
- } while (d < dend);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- uint8* dend = d + dst_width;
- do {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 + b0 + 1) >> 1;
- d[1] = (a1 + b1 + 1) >> 1;
- d[2] = (a2 + b2 + 1) >> 1;
- d += 3;
- s += 4;
- t += 4;
- } while (d < dend);
-}
-
-// (1-f)a + fb can be replaced with a + f(b-a)
-#define BLENDER(a, b, f) (static_cast<int>(a) + \
- ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
-
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- for (int j = 0; j < dst_width - 1; j += 2) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- x += dx;
- xi = x >> 16;
- a = src_ptr[xi];
- b = src_ptr[xi + 1];
- dst_ptr[1] = BLENDER(a, b, x & 0xffff);
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- }
-}
-
-static const int kMaxInputWidth = 2560;
-
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-// Filter row to 3/4
-static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- const uint8* s = src_ptr;
- uint8* dend = dst_ptr + dst_width;
- do {
- dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- dst_ptr += 3;
- s += 4;
- } while (dst_ptr < dend);
-}
-
-#define HAS_SCALEROWDOWN34_SSE2_DISABLED
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
- ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
- ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
- ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
- ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
-#endif
-
-static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- assert(dst_width % 3 == 0);
- for (int x = 0; x < dst_width; x += 3) {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[3];
- dst[2] = src_ptr[6];
- dst += 3;
- src_ptr += 8;
- }
-}
-
-// 8x3 -> 3x1
-static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- intptr_t stride = src_stride;
- for (int i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
- src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-// 8x2 -> 3x1
-static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- intptr_t stride = src_stride;
- for (int i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2]) * (65536 / 6) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5]) * (65536 / 6) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-// C version 8x2 -> 8x1
-static void ScaleFilterRows_C(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- assert(dst_width > 0);
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
- uint8* end = dst_ptr + dst_width;
- do {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
- dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
- dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
- dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
- dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
- dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
- src_ptr += 8;
- src_ptr1 += 8;
- dst_ptr += 8;
- } while (dst_ptr < end);
- dst_ptr[0] = dst_ptr[-1];
-}
-
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
- assert(src_width > 0);
- assert(src_height > 0);
- for (int x = 0; x < src_width; ++x) {
- const uint8* s = src_ptr + x;
- int sum = 0;
- for (int y = 0; y < src_height; ++y) {
- sum += s[0];
- s += src_stride;
- }
- dst_ptr[x] = sum;
- }
-}
-
-/**
- * Scale plane, 1/2
- *
- * This is an optimized version for scaling down a plane to 1/2 of
- * its original size.
- *
- */
-static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
+static void ScalePlaneDown2(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
+ enum FilterMode filtering) {
+ int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
- filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
-#if defined(HAS_SCALEROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
+ filtering == kFilterNone ? ScaleRowDown2_C :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+ int row_stride = src_stride << 1;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
}
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
- ScaleRowDown2_Unaligned_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
+ ScaleRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
+ ScaleRowDown2Box_NEON);
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
+ ScaleRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
+ ScaleRowDown2Box_SSSE3);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
+ ScaleRowDown2Box_Any_AVX2);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
+ ScaleRowDown2Box_AVX2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ?
+ ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+ }
+#endif
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
// TODO(fbarchard): Loop through source height to allow odd height.
- for (int y = 0; y < dst_height; ++y) {
+ for (y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += (src_stride << 1);
+ src_ptr += row_stride;
dst_ptr += dst_stride;
}
}
-/**
- * Scale plane, 1/4
- *
- * This is an optimized version for scaling down a plane to 1/4 of
- * its original size.
- */
-static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
+static void ScalePlaneDown2_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering == kFilterNone ? ScaleRowDown2_16_C :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
+ ScaleRowDown2Box_16_C);
+ int row_stride = src_stride << 1;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
+ ScaleRowDown2_16_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+ ScaleRowDown2Box_16_SSE2);
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ?
+ ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.
+
+static void ScalePlaneDown4(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
+ enum FilterMode filtering) {
+ int y;
void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
- filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
+ filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+ int row_stride = src_stride << 2;
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0;
+ }
#if defined(HAS_SCALEROWDOWN4_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(dst_width, 4)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+ }
}
-#elif defined(HAS_SCALEROWDOWN4_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
}
#endif
- for (int y = 0; y < dst_height; ++y) {
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += (src_stride << 2);
+ src_ptr += row_stride;
dst_ptr += dst_stride;
}
}
-/**
- * Scale plane, 1/8
- *
- * This is an optimized version for scaling down a plane to 1/8
- * of its original size.
- *
- */
-static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) =
- filtering && (dst_width <= kMaxOutputWidth) ?
- ScaleRowDown8Int_C : ScaleRowDown8_C;
-#if defined(HAS_SCALEROWDOWN8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
+static void ScalePlaneDown4_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+ int row_stride = src_stride << 2;
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0;
+ }
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
+ ScaleRowDown4_16_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
+ ScaleRowDown4_16_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
}
#endif
- for (int y = 0; y < dst_height; ++y) {
- ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += (src_stride << 3);
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
dst_ptr += dst_stride;
}
}
-/**
- * Scale plane down, 3/4
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
+// Scale plane down, 3/4
+
+static void ScalePlaneDown34(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- assert(dst_width % 3 == 0);
+ enum FilterMode filtering) {
+ int y;
void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_C;
ScaleRowDown34_1 = ScaleRowDown34_C;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
}
#if defined(HAS_SCALEROWDOWN34_NEON)
- if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+ if (TestCpuFlag(kCpuHasNEON)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_NEON;
+ ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
+ }
+ if (dst_width % 24 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+ }
}
}
#endif
-#if defined(HAS_SCALEROWDOWN34_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
+ }
+ if (dst_width % 24 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+ }
+ }
}
#endif
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#if defined(HAS_SCALEROWDOWN34_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
}
}
#endif
- for (int y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
dst_ptr, dst_width);
src_ptr += src_stride * 2;
dst_ptr += dst_stride;
@@ -2457,7 +354,7 @@
// Remainder 1 or 2 rows with last row vertically unfiltered
if ((dst_height % 3) == 2) {
- ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
@@ -2466,78 +363,189 @@
}
}
-/**
- * Scale plane, 3/8
- *
- * This is an optimized version for scaling down a plane to 3/8
- * of its original size.
- *
- * Uses box filter arranges like this
- * aaabbbcc -> abc
- * aaabbbcc def
- * aaabbbcc ghi
- * dddeeeff
- * dddeeeff
- * dddeeeff
- * ggghhhii
- * ggghhhii
- * Boxes are 3x3, 2x3, 3x2 and 2x2
- */
-static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
+static void ScalePlaneDown34_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
assert(dst_width % 3 == 0);
- void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_C;
- ScaleRowDown38_2 = ScaleRowDown38_C;
+ ScaleRowDown34_0 = ScaleRowDown34_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_16_C;
} else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
}
-#if defined(HAS_SCALEROWDOWN38_NEON)
- if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
} else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
}
}
-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
} else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
}
}
#endif
- for (int y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+ dst_ptr, dst_width);
src_ptr += src_stride * 2;
dst_ptr += dst_stride;
}
// Remainder 1 or 2 rows with last row vertically unfiltered
if ((dst_height % 3) == 2) {
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses box filter arranges like this
+// aaabbbcc -> abc
+// aaabbbcc def
+// aaabbbcc ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+static void ScalePlaneDown38(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
+ }
+
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
+ }
+ if (dst_width % 12 == 0 && !filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+ }
+ if (dst_width % 6 == 0 && filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride * 3;
dst_ptr += dst_stride;
ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
@@ -2546,36 +554,101 @@
}
}
-static __inline uint32 SumBox(int iboxwidth, int iboxheight,
- ptrdiff_t src_stride, const uint8* src_ptr) {
- assert(iboxwidth > 0);
- assert(iboxheight > 0);
- uint32 sum = 0u;
- for (int y = 0; y < iboxheight; ++y) {
- for (int x = 0; x < iboxwidth; ++x) {
- sum += src_ptr[x];
+static void ScalePlaneDown38_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_16_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
+ }
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
}
- src_ptr += src_stride;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+#define MIN1(x) ((x) < 1 ? 1 : (x))
+
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+ uint32 sum = 0u;
+ int x;
+ assert(iboxwidth > 0);
+ for (x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
}
return sum;
}
-static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
- int x, int dx, ptrdiff_t src_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- for (int i = 0; i < dst_width; ++i) {
- int ix = x >> 16;
- x += dx;
- int boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
- (boxwidth * boxheight);
- }
-}
-
-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
- assert(iboxwidth > 0);
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
uint32 sum = 0u;
- for (int x = 0; x < iboxwidth; ++x) {
+ int x;
+ assert(iboxwidth > 0);
+ for (x = 0; x < iboxwidth; ++x) {
sum += src_ptr[x];
}
return sum;
@@ -2583,339 +656,895 @@
static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
+ int i;
int scaletbl[2];
- int minboxwidth = (dx >> 16);
- scaletbl[0] = 65536 / (minboxwidth * boxheight);
- scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
- int *scaleptr = scaletbl - minboxwidth;
- for (int i = 0; i < dst_width; ++i) {
+ int minboxwidth = dx >> 16;
+ int boxwidth;
+ scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+ scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+ for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
- int boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+ boxwidth = MIN1((x >> 16) - ix);
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
+ scaletbl[boxwidth - minboxwidth] >> 16;
+ }
+}
+
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
+ const uint32* src_ptr, uint16* dst_ptr) {
+ int i;
+ int scaletbl[2];
+ int minboxwidth = dx >> 16;
+ int boxwidth;
+ scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+ scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+ for (i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ x += dx;
+ boxwidth = MIN1((x >> 16) - ix);
+ *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
+ scaletbl[boxwidth - minboxwidth] >> 16;
+ }
+}
+
+static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int scaleval = 65536 / boxheight;
+ int i;
+ src_ptr += (x >> 16);
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = src_ptr[i] * scaleval >> 16;
}
}
static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
- int boxwidth = (dx >> 16);
+ int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
- for (int i = 0; i < dst_width; ++i) {
+ int i;
+ x >>= 16;
+ for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth;
}
}
-/**
- * Scale plane down to any dimensions, with interpolation.
- * (boxfilter).
- *
- * Same method as SimpleScale, which is fixed point, outputting
- * one pixel of destination using fixed point (16.16) to step
- * through source, sampling a box of pixel with simple
- * averaging.
- */
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
+ const uint32* src_ptr, uint16* dst_ptr) {
+ int boxwidth = MIN1(dx >> 16);
+ int scaleval = 65536 / (boxwidth * boxheight);
+ int i;
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+ x += boxwidth;
+ }
+}
+
+// Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
+//
+// Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixel with simple
+// averaging.
static void ScalePlaneBox(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
- assert(dst_width > 0);
- assert(dst_height > 0);
- int dx = (src_width << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
- int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
- int maxy = (src_height << 16);
- if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
- dst_height * 2 > src_height) {
- uint8* dst = dst_ptr;
- for (int j = 0; j < dst_height; ++j) {
- int iy = y >> 16;
- const uint8* src = src_ptr + iy * src_stride;
- y += dy;
- if (y > maxy) {
- y = maxy;
- }
- int boxheight = (y >> 16) - iy;
- ScalePlaneBoxRow_C(dst_width, boxheight,
- x, dx, src_stride,
- src, dst);
- dst += dst_stride;
- }
- } else {
- SIMD_ALIGNED(uint16 row[kMaxInputWidth]);
- void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height)=
- ScaleAddRows_C;
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint16.
+ align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr);
- if (dx & 0xffff) {
- ScaleAddCols = ScaleAddCols2_C;
- } else {
- ScaleAddCols = ScaleAddCols1_C;
+ const uint16* src_ptr, uint8* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_C:
+ ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+ void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+ ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleAddRow = ScaleAddRow_Any_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_SSE2;
+ }
}
-#if defined(HAS_SCALEADDROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
- ScaleAddRows = ScaleAddRows_SSE2;
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleAddRow = ScaleAddRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ ScaleAddRow = ScaleAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleAddRow = ScaleAddRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_NEON;
+ }
}
#endif
- for (int j = 0; j < dst_height; ++j) {
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
int iy = y >> 16;
const uint8* src = src_ptr + iy * src_stride;
y += dy;
- if (y > (src_height << 16)) {
- y = (src_height << 16);
+ if (y > max_y) {
+ y = max_y;
}
- int boxheight = (y >> 16) - iy;
- ScaleAddRows(src, src_stride, row, src_width, boxheight);
- ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row16, 0, src_width * 2);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint16 *)(row16), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
dst_ptr += dst_stride;
}
+ free_aligned_buffer_64(row16);
}
}
-/**
- * Scale plane to/from any dimensions, with interpolation.
- */
-static void ScalePlaneBilinearSimple(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int dx = (src_width << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
- int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
- int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
- for (int i = 0; i < dst_height; ++i) {
- int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+static void ScalePlaneBox_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr) {
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint32.
+ align_buffer_64(row32, src_width * 4);
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint32* src_ptr, uint16* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+ void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+ ScaleAddRow_16_C;
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_16_SSE2;
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
+ int iy = y >> 16;
+ const uint16* src = src_ptr + iy * src_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row32, 0, src_width * 4);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint32 *)(row32), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ free_aligned_buffer_64(row32);
+ }
+}
+
+// Scale plane down with bilinear interpolation.
+void ScalePlaneBilinearDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row buffer.
+ align_buffer_64(row, src_width);
+
+ const int max_y = (src_height - 1) << 16;
+ int j;
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleFilterCols_NEON;
+ }
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- int yf = y & 0xffff;
- const uint8* src0 = src_ptr + yi * src_stride;
- const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
- uint8* dst = dst_ptr;
- for (int j = 0; j < dst_width; ++j) {
- int xi = x >> 16;
- int xf = x & 0xffff;
- int x1 = (xi < src_width - 1) ? xi + 1 : xi;
- int a = src0[xi];
- int b = src0[x1];
- int r0 = BLENDER(a, b, xf);
- a = src1[xi];
- b = src1[x1];
- int r1 = BLENDER(a, b, xf);
- *dst++ = BLENDER(r0, r1, yf);
- x += dx;
- if (x > maxx)
- x = maxx;
+ const uint8* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
}
dst_ptr += dst_stride;
y += dy;
- if (y > maxy)
- y = maxy;
+ if (y > max_y) {
+ y = max_y;
+ }
}
+ free_aligned_buffer_64(row);
}
-/**
- * Scale plane to/from any dimensions, with bilinear
- * interpolation.
- */
-void ScalePlaneBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- assert(dst_width > 0);
- assert(dst_height > 0);
- if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
- ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
+void ScalePlaneBilinearDown_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row buffer.
+ align_buffer_64(row, src_width * 2);
- } else {
- SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]);
- void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) =
- ScaleFilterRows_C;
-#if defined(HAS_SCALEFILTERROWS_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ScaleFilterRows = ScaleFilterRows_NEON;
+ const int max_y = (src_height - 1) << 16;
+ int j;
+ void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+ void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
}
+ }
#endif
-#if defined(HAS_SCALEFILTERROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
- ScaleFilterRows = ScaleFilterRows_SSE2;
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
+ }
#endif
-#if defined(HAS_SCALEFILTERROWS_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
- ScaleFilterRows = ScaleFilterRows_SSSE3;
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
}
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_16_DSPR2;
+ }
+ }
#endif
- int dx = (src_width << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
- int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
- int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
- for (int j = 0; j < dst_height; ++j) {
- int yi = y >> 16;
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
int yf = (y >> 8) & 255;
- const uint8* src = src_ptr + yi * src_stride;
- ScaleFilterRows(row, src, src_stride, src_width, yf);
- ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
+ InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+}
+
+// Scale up down with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_C : ScaleCols_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleFilterCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
dst_ptr += dst_stride;
y += dy;
- if (y > maxy) {
- y = maxy;
- }
}
+ free_aligned_buffer_64(row);
}
}
-/**
- * Scale plane to/from any dimensions, without interpolation.
- * Fixed point math is used for performance: The upper 16 bits
- * of x and dx is the integer part of the source position and
- * the lower 16 bits are the fixed decimal part.
- */
+void ScalePlaneBilinearUp_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_16_DSPR2;
+ }
+ }
+#endif
+
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_16_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 4);
+
+ uint16* rowptr = (uint16*)row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale Plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
static void ScalePlaneSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
- int dx = (src_width << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
- for (int j = 0; j < dst_height; ++j) {
- int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
- int yi = y >> 16;
- const uint8* src = src_ptr + yi * src_stride;
- uint8* dst = dst_ptr;
- for (int i = 0; i < dst_width; ++i) {
- *dst++ = src[x >> 16];
- x += dx;
+ int i;
+ void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) = ScaleCols_C;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_SSE2;
}
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
}
}
-/**
- * Scale plane to/from any dimensions.
- */
-static void ScalePlaneAnySize(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- if (!filtering) {
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else {
- // fall back to non-optimized version
- ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- }
-}
+static void ScalePlaneSimple_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr) {
+ int i;
+ void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) = ScaleCols_16_C;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
-/**
- * Scale plane down, any size
- *
- * This is an optimized version for scaling down a plane to any size.
- * The current implementation is ~10 times faster compared to the
- * reference implementation for e.g. XGA->LowResPAL
- *
- */
-static void ScalePlaneDown(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- if (!filtering) {
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
- // between 1/2x and 1x use bilinear
- ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else {
- ScalePlaneBox(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+ dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
}
}
// Scale a plane.
-// This function in turn calls a scaling function suitable for handling
-// the desired resolutions.
+// This function dispatches to a specialized scaler based on scale factor.
LIBYUV_API
void ScalePlane(const uint8* src, int src_stride,
int src_width, int src_height,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
- FilterMode filtering) {
-#ifdef CPU_X86
- // environment variable overrides for testing.
- char *filter_override = getenv("LIBYUV_FILTER");
- if (filter_override) {
- filtering = (FilterMode)atoi(filter_override); // NOLINT
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height, filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
}
-#endif
+
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
- } else if (dst_width <= src_width && dst_height <= src_height) {
+ return;
+ }
+ if (dst_width == src_width && filtering != kFilterBox) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
- if (use_reference_impl_) {
- // For testing, allow the optimized versions to be disabled.
- ScalePlaneDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == 3 * src_width &&
- 4 * dst_height == 3 * src_height) {
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
// optimized, 3/4
ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
// optimized, 1/2
ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
// 3/8 rounded up for odd sized chroma height.
- } else if (8 * dst_width == 3 * src_width &&
- dst_height == ((src_height * 3 + 7) / 8)) {
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
// optimized, 3/8
ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
- filtering != kFilterBilinear) {
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ (filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
- filtering != kFilterBilinear) {
- // optimized, 1/8
- ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else {
- // Arbitrary downsample
- ScalePlaneDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
+ return;
}
- } else {
- // Arbitrary scale up and/or down.
- ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
}
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+ int src_width, int src_height,
+ uint16* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height, filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled vertically.
+ ScalePlaneVertical_16(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
+ // optimized, 1/4
+ ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
}
// Scale an I420 image.
// This function in turn calls a scaling function for each plane.
-#define UNDER_ALLOCATED_HACK 1
-
LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -2925,48 +1554,16 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int dst_width, int dst_height,
- FilterMode filtering) {
- if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1;
}
- // Negative height means invert the image.
- if (src_height < 0) {
- src_height = -src_height;
- int halfheight = (src_height + 1) >> 1;
- src_y = src_y + (src_height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int src_halfwidth = (src_width + 1) >> 1;
- int src_halfheight = (src_height + 1) >> 1;
- int dst_halfwidth = (dst_width + 1) >> 1;
- int dst_halfheight = (dst_height + 1) >> 1;
-
-#ifdef UNDER_ALLOCATED_HACK
- // If caller passed width / 2 for stride, adjust halfwidth to match.
- if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
- src_halfwidth = src_width >> 1;
- }
- if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
- dst_halfwidth = dst_width >> 1;
- }
- // If caller used height / 2 when computing src_v, it will point into what
- // should be the src_u plane. Detect this and reduce halfheight to match.
- int uv_src_plane_size = src_halfwidth * src_halfheight;
- if ((src_height & 1) &&
- (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
- src_halfheight = src_height >> 1;
- }
- int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
- if ((dst_height & 1) &&
- (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
- dst_halfheight = dst_height >> 1;
- }
-#endif
ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height,
@@ -2980,6 +1577,38 @@
return 0;
}
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+ const uint16* src_u, int src_stride_u,
+ const uint16* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint16* dst_y, int dst_stride_y,
+ uint16* dst_u, int dst_stride_u,
+ uint16* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering);
+ ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
+ return 0;
+}
+
// Deprecated api
LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
@@ -2988,90 +1617,53 @@
uint8* dst_y, uint8* dst_u, uint8* dst_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height,
- bool interpolate) {
- if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (src_height < 0) {
- src_height = -src_height;
- int halfheight = (src_height + 1) >> 1;
- src_y = src_y + (src_height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int src_halfwidth = (src_width + 1) >> 1;
- int src_halfheight = (src_height + 1) >> 1;
- int dst_halfwidth = (dst_width + 1) >> 1;
- int dst_halfheight = (dst_height + 1) >> 1;
- FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
-
-#ifdef UNDER_ALLOCATED_HACK
- // If caller passed width / 2 for stride, adjust halfwidth to match.
- if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
- src_halfwidth = src_width >> 1;
- }
- if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
- dst_halfwidth = dst_width >> 1;
- }
- // If caller used height / 2 when computing src_v, it will point into what
- // should be the src_u plane. Detect this and reduce halfheight to match.
- int uv_src_plane_size = src_halfwidth * src_halfheight;
- if ((src_height & 1) &&
- (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
- src_halfheight = src_height >> 1;
- }
- int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
- if ((dst_height & 1) &&
- (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
- dst_halfheight = dst_height >> 1;
- }
-#endif
-
- ScalePlane(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering);
- return 0;
+ LIBYUV_BOOL interpolate) {
+ return I420Scale(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ src_width, src_height,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ dst_width, dst_height,
+ interpolate ? kFilterBox : kFilterNone);
}
// Deprecated api
LIBYUV_API
int ScaleOffset(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_yoffset,
- bool interpolate) {
- if (!src || src_width <= 0 || src_height <= 0 ||
- !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
- dst_yoffset >= dst_height) {
- return -1;
- }
- dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2.
- int src_halfwidth = (src_width + 1) >> 1;
- int src_halfheight = (src_height + 1) >> 1;
- int dst_halfwidth = (dst_width + 1) >> 1;
- int dst_halfheight = (dst_height + 1) >> 1;
- int aheight = dst_height - dst_yoffset * 2; // actual output height
+ LIBYUV_BOOL interpolate) {
+ // Chroma requires offset to multiple of 2.
+ int dst_yoffset_even = dst_yoffset & ~1;
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int aheight = dst_height - dst_yoffset_even * 2; // actual output height
const uint8* src_y = src;
const uint8* src_u = src + src_width * src_height;
const uint8* src_v = src + src_width * src_height +
src_halfwidth * src_halfheight;
- uint8* dst_y = dst + dst_yoffset * dst_width;
+ uint8* dst_y = dst + dst_yoffset_even * dst_width;
uint8* dst_u = dst + dst_width * dst_height +
- (dst_yoffset >> 1) * dst_halfwidth;
+ (dst_yoffset_even >> 1) * dst_halfwidth;
uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
- (dst_yoffset >> 1) * dst_halfwidth;
- return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
- src_width, src_height, dst_y, dst_u, dst_v, dst_width,
- dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
+ (dst_yoffset_even >> 1) * dst_halfwidth;
+ if (!src || src_width <= 0 || src_height <= 0 ||
+ !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+ dst_yoffset_even >= dst_height) {
+ return -1;
+ }
+ return I420Scale(src_y, src_width,
+ src_u, src_halfwidth,
+ src_v, src_halfwidth,
+ src_width, src_height,
+ dst_y, dst_width,
+ dst_u, dst_halfwidth,
+ dst_v, dst_halfwidth,
+ dst_width, aheight,
+ interpolate ? kFilterBox : kFilterNone);
}
#ifdef __cplusplus
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
new file mode 100644
index 0000000..ed76a9e
--- /dev/null
+++ b/files/source/scale_any.cc
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
+ int dst_width, int x, int dx) { \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, \
+ dst_width & MASK, x + n * dx, dx); \
+ }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C, 4, 3)
+#endif
+#undef CANY
+
+// Fixed scale down.
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
+ uint8* dst_ptr, int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
+
+// Fixed scale down for odd source width. Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
+ uint8* dst_ptr, int dst_width) { \
+ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
+ ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
+ 2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+ ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
+ 2, 1, 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
+ 2, 1, 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
+ ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_C, 2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
+ 4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
+ 4, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
+ 4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
+ ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
+ ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
+ ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
+ ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
+ ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
+ ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
+ ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
+ ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
+ ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
+ ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
+ ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
+ ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
+ ScaleARGBRowDown2_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
+ ScaleARGBRowDown2Linear_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
+ ScaleARGBRowDown2Box_C, 2, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
+ ScaleARGBRowDown2_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
+ ScaleARGBRowDown2Linear_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
+ ScaleARGBRowDown2Box_C, 2, 4, 7)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8* dst_ptr, int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
+ src_stepx, dst_ptr + n * BPP, r); \
+ }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
+ ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
+ ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
+ ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
+ ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
+ int n = src_width & ~MASK; \
+ if (n > 0) { \
+ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
index 5d4e1ac..17f51ae 100644
--- a/files/source/scale_argb.cc
+++ b/files/source/scale_argb.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -12,1023 +12,847 @@
#include <assert.h>
#include <string.h>
-#include <stdlib.h> // For getenv()
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" // For CopyARGB
#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// Bilinear SSE2 is disabled.
-#define SSE2_DISABLED 1
-
-// ARGB scaling uses bilinear or point, but not box filter.
-/**
- * SSE2 downscalers with bilinear interpolation.
- */
-
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-
-#define HAS_SCALEARGBROWDOWN2_SSE2
-// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- shufps xmm0, xmm1, 0x88
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
}
-// Blends 8x2 rectangle to 4x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push ebx
- push edi
- mov eax, [esp + 8 + 4] // src_ptr
- // src_stride ignored
- mov ebx, [esp + 8 + 12] // src_stepx
- mov edx, [esp + 8 + 16] // dst_ptr
- mov ecx, [esp + 8 + 20] // dst_width
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- align 16
- wloop:
- movd xmm0, [eax]
- movd xmm1, [eax + ebx]
- punpckldq xmm0, xmm1
- movd xmm2, [eax + ebx * 2]
- movd xmm3, [eax + edi]
- lea eax, [eax + ebx * 4]
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop edi
- pop ebx
- ret
- }
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_ptr
- mov esi, [esp + 12 + 8] // src_stride
- mov ebx, [esp + 12 + 12] // src_stepx
- mov edx, [esp + 12 + 16] // dst_ptr
- mov ecx, [esp + 12 + 20] // dst_width
- lea esi, [eax + esi] // row1 pointer
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- align 16
- wloop:
- movq xmm0, qword ptr [eax] // row0 4 pairs
- movhps xmm0, qword ptr [eax + ebx]
- movq xmm1, qword ptr [eax + ebx * 2]
- movhps xmm1, qword ptr [eax + edi]
- lea eax, [eax + ebx * 4]
- movq xmm2, qword ptr [esi] // row1 4 pairs
- movhps xmm2, qword ptr [esi + ebx]
- movq xmm3, qword ptr [esi + ebx * 2]
- movhps xmm3, qword ptr [esi + edi]
- lea esi, [esi + ebx * 4]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
-#ifndef SSE2_DISABLED
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
-__declspec(naked) __declspec(align(16))
-void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- cmp eax, 0
- je xloop1
- cmp eax, 128
- je xloop2
-
- movd xmm5, eax // xmm5 = y fraction
- punpcklbw xmm5, xmm5
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
- pxor xmm4, xmm4
-
- // f * row1 + (1 - frac) row0
- // frac * (row1 - row0) + row0
- align 16
- xloop:
- movdqa xmm0, [esi] // row0
- movdqa xmm2, [esi + edx] // row1
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- psubw xmm2, xmm0 // row1 - row0
- psubw xmm3, xmm1
- pmulhw xmm2, xmm5 // scale diff
- pmulhw xmm3, xmm5
- paddw xmm0, xmm2 // sum rows
- paddw xmm1, xmm3
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
-
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
- pop edi
- pop esi
- ret
-
- align 16
- xloop1:
- movdqa xmm0, [esi]
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop1
-
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- align 16
- xloop2:
- movdqa xmm0, [esi]
- pavgb xmm0, [esi + edx]
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop2
-
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
- }
-}
-#endif // SSE2_DISABLED
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
-#define HAS_SCALEARGBFILTERROWS_SSSE3
-__declspec(naked) __declspec(align(16))
-void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
- cmp eax, 0
- je xloop1
- cmp eax, 64
- je xloop2
- movd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- align 16
- xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
-
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
- pop edi
- pop esi
- ret
-
- align 16
- xloop1:
- movdqa xmm0, [esi]
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop1
-
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- align 16
- xloop2:
- movdqa xmm0, [esi]
- pavgb xmm0, [esi + edx]
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop2
-
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
- }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEARGBROWDOWN2_SSE2
-static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%3,1),%%xmm2 \n"
- "movdqa 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_ptr 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_ptr, int dst_width) {
- intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
- intptr_t src_stepx_x12 = 0;
- asm volatile (
- "lea 0x0(,%1,4),%1 \n"
- "lea (%1,%1,2),%4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movd (%0),%%xmm0 \n"
- "movd (%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd (%0,%1,2),%%xmm2 \n"
- "movd (%0,%4,1),%%xmm3 \n"
- "lea (%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_ptr), // %2
- "+r"(dst_width), // %3
- "+r"(src_stepx_x12) // %4
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_ptr 16 byte aligned.
-static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride, int src_stepx,
- uint8* dst_ptr, int dst_width) {
- intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
- intptr_t src_stepx_x12 = 0;
- intptr_t row1 = static_cast<intptr_t>(src_stride);
- asm volatile (
- "lea 0x0(,%1,4),%1 \n"
- "lea (%1,%1,2),%4 \n"
- "lea (%0,%5,1),%5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps (%0,%1,1),%%xmm0 \n"
- "movq (%0,%1,2),%%xmm1 \n"
- "movhps (%0,%4,1),%%xmm1 \n"
- "lea (%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps (%5,%1,1),%%xmm2 \n"
- "movq (%5,%1,2),%%xmm3 \n"
- "movhps (%5,%4,1),%%xmm3 \n"
- "lea (%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_ptr), // %2
- "+rm"(dst_width), // %3
- "+r"(src_stepx_x12), // %4
- "+r"(row1) // %5
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-#ifndef SSE2_DISABLED
-// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
-void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 2f \n"
- "cmp $0x80,%3 \n"
- "je 3f \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "psubw %%xmm0,%%xmm2 \n"
- "psubw %%xmm1,%%xmm3 \n"
- "pmulhw %%xmm5,%%xmm2 \n"
- "pmulhw %%xmm5,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "2: \n"
- "movdqa (%1),%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqa (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- ".p2align 4 \n"
- "4: \n"
- "shufps $0xff,%%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // SSE2_DISABLED
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-#define HAS_SCALEARGBFILTERROWS_SSSE3
-void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 2f \n"
- "cmp $0x40,%3 \n"
- "je 3f \n"
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "2: \n"
- "movdqa (%1),%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqa (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- "4: \n"
- ".p2align 4 \n"
- "shufps $0xff,%%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
- );
-}
-#endif // defined(__x86_64__) || defined(__i386__)
-
-static void ScaleARGBRowDown2_C(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width) {
- const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
- uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
-
- for (int x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src[0];
- dst[1] = src[2];
- src += 4;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[0];
- }
-}
-
-static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
- src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
- dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
- src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
- dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
- src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
- dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
- src_ptr += 8;
- dst_ptr += 4;
- }
-}
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- int src_stepx,
- uint8* dst_ptr, int dst_width) {
- const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
- uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
-
- for (int x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src[0];
- dst[1] = src[src_stepx];
- src += src_stepx * 2;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[0];
- }
-}
-
-static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_ptr, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
- src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
- dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
- src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
- dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
- src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
- dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
- src_ptr += src_stepx * 4;
- dst_ptr += 4;
- }
-}
-
-// (1-f)a + fb can be replaced with a + f(b-a)
-
-#define BLENDER1(a, b, f) (static_cast<int>(a) + \
- ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
-
-#define BLENDERC(a, b, f, s) static_cast<uint32>( \
- BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-
-#define BLENDER(a, b, f) \
- BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
- BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
-
-static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
- uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
- for (int j = 0; j < dst_width - 1; j += 2) {
- int xi = x >> 16;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, x & 0xffff);
- x += dx;
- xi = x >> 16;
- a = src[xi];
- b = src[xi + 1];
- dst[1] = BLENDER(a, b, x & 0xffff);
- x += dx;
- dst += 2;
- }
- if (dst_width & 1) {
- int xi = x >> 16;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, x & 0xffff);
- }
-}
-
-static const int kMaxInputWidth = 2560;
-
-// C version 2x2 -> 2x1
-void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- assert(dst_width > 0);
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
- uint8* end = dst_ptr + (dst_width << 2);
- do {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
- dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
- dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
- dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
- dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
- dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
- src_ptr += 8;
- src_ptr1 += 8;
- dst_ptr += 8;
- } while (dst_ptr < end);
- // Duplicate the last pixel (4 bytes) for filtering.
- dst_ptr[0] = dst_ptr[-4];
- dst_ptr[1] = dst_ptr[-3];
- dst_ptr[2] = dst_ptr[-2];
- dst_ptr[3] = dst_ptr[-1];
-}
-
-/**
- * ScaleARGB ARGB, 1/2
- *
- * This is an optimized version for scaling down a ARGB to 1/2 of
- * its original size.
- *
- */
-static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down a ARGB to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) =
- filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
- ScaleARGBRowDown2_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+ ScaleARGBRowDown2Box_Any_SSE2);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+ ScaleARGBRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+ ScaleARGBRowDown2Box_NEON);
+ }
}
#endif
- // TODO(fbarchard): Loop through source height to allow odd height.
- for (int y = 0; y < dst_height; ++y) {
- ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += (src_stride << 1);
- dst_ptr += dst_stride;
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
}
}
-/**
- * ScaleARGB ARGB Even
- *
- * This is an optimized version for scaling down a ARGB to even
- * multiple of its original size.
- *
- */
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down a ARGB to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+ row + kRowSize, dst_width * 2);
+ ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down a ARGB to even
+// multiple of its original size.
static void ScaleARGBDownEven(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
- void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride,
- int src_step, uint8* dst_ptr, int dst_width) =
- filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
- ScaleARGBRowDownEven_SSE2;
- }
-#endif
- int src_step = src_width / dst_width;
- // Adjust to point to center of box.
- int row_step = src_height / dst_height;
- int row_stride = row_step * src_stride;
- src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
- for (int y = 0; y < dst_height; ++y) {
- ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
- src_ptr += row_stride;
- dst_ptr += dst_stride;
- }
-}
-/**
- * ScaleARGB ARGB to/from any dimensions, with bilinear
- * interpolation.
- */
-
-static void ScaleARGBBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- assert(dst_width > 0);
- assert(dst_height > 0);
- assert(src_width <= kMaxInputWidth);
- SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]);
- void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) =
- ScaleARGBFilterRows_C;
-#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
- ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
- }
-#endif
-#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
- ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
- }
-#endif
- int dx = (src_width << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
- int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
- int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
- for (int j = 0; j < dst_height; ++j) {
- int yi = y >> 16;
- int yf = (y >> 8) & 255;
- const uint8* src = src_ptr + yi * src_stride;
- ScaleARGBFilterRows(row, src, src_stride, src_width, yf);
- ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx);
- dst_ptr += dst_stride;
- y += dy;
- if (y > maxy) {
- y = maxy;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+ ScaleARGBRowDownEven_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+ ScaleARGBRowDownEven_SSE2;
}
}
-}
-
-// Scales a single row of pixels using point sampling.
-// Code is adapted from libyuv bilinear yuv scaling, but with bilinear
-// interpolation off, and argb pixels instead of yuv.
-static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
- uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
- for (int j = 0; j < dst_width - 1; j += 2) {
- dst[0] = src[x >> 16];
- x += dx;
- dst[1] = src[x >> 16];
- x += dx;
- dst += 2;
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+ ScaleARGBRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+ ScaleARGBRowDownEven_NEON;
+ }
}
- if (dst_width & 1) {
- dst[0] = src[x >> 16];
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
}
}
-/**
- * ScaleARGB ARGB to/from any dimensions, without interpolation.
- * Fixed point math is used for performance: The upper 16 bits
- * of x and dx is the integer part of the source position and
- * the lower 16 bits are the fixed decimal part.
- */
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+ int64 xlast = x + (int64)(dst_width - 1) * dx;
+ int64 xl = (dx >= 0) ? x : xlast;
+ int64 xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
+ src_argb += xl * 4;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(clip_src_width, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of ARGB.
+ {
+ align_buffer_64(row, clip_src_width * 4);
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale ARGB up with bilinear interpolation.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_argb + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_DSPR2;
+ }
+#endif
+
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+#endif
+
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
+ int yi = y >> 16;
+ int uv_yi = yi >> kYShift;
+ const uint8* src_row_y = src_y + yi * src_stride_y;
+ const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+ const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ // Allocate 1 row of ARGB for source conversion.
+ align_buffer_64(argb_row, src_width * 4);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+ ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+ if (src_height > 1) {
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+ if (src_height > 2) {
+ src_row_y += src_stride_y;
+ if (!(yi & 1)) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ uv_yi = yi >> kYShift;
+ src_row_y = src_y + yi * src_stride_y;
+ src_row_u = src_u + uv_yi * src_stride_u;
+ src_row_v = src_v + uv_yi * src_stride_v;
+ }
+ if (yi != lasty) {
+ // TODO(fbarchard): Convert the clipped region of row.
+ I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+ ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ free_aligned_buffer_64(row_argb);
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
static void ScaleARGBSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int dx = (src_width << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
- int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
- for (int i = 0; i < dst_height; ++i) {
- ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
- dst_ptr += dst_stride;
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+ dst_width, x, dx);
+ dst_argb += dst_stride;
y += dy;
}
}
-/**
- * ScaleARGB ARGB to/from any dimensions.
- */
-static void ScaleARGBAnySize(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- if (!filtering || (src_width > kMaxInputWidth)) {
- ScaleARGBSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else {
- ScaleARGBBilinear(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- }
-}
-
// ScaleARGB a ARGB.
-//
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
-
static void ScaleARGB(const uint8* src, int src_stride,
int src_width, int src_height,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
- FilterMode filtering) {
-#ifdef CPU_X86
- // environment variable overrides for testing.
- char *filter_override = getenv("LIBYUV_FILTER");
- if (filter_override) {
- filtering = (FilterMode)atoi(filter_override); // NOLINT
- }
-#endif
- if (dst_width == src_width && dst_height == src_height) {
- // Straight copy.
- ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height);
- return;
- }
- if (2 * dst_width == src_width && 2 * dst_height == src_height) {
- // Optimized 1/2.
- ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- int scale_down_x = src_width / dst_width;
- int scale_down_y = src_height / dst_height;
- if (dst_width * scale_down_x == src_width &&
- dst_height * scale_down_y == src_height) {
- if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
- // Optimized even scale down. ie 4, 6, 8, 10x
- ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
- }
- if ((scale_down_x & 1) && (scale_down_y & 1)) {
- filtering = kFilterNone;
- }
- }
- // Arbitrary scale up and/or down.
- ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
-}
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // ARGB does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
-// ScaleARGB an ARGB image.
-LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
- FilterMode filtering) {
- if (!src_argb || src_width <= 0 || src_height == 0 ||
- !dst_argb || dst_width <= 0 || dst_height <= 0) {
- return -1;
- }
- // Negative height means invert the image.
+ // Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src_argb = src_argb + (src_height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64 clipf = (int64)(clip_x) * dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 4;
+ dst += clip_x * 4;
+ }
+ if (clip_y) {
+ int64 clipf = (int64)(clip_y) * dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleARGBDown2(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleARGBDown4Box(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+ return;
+ }
+ ScaleARGBDownEven(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ // Optimized odd scale down. ie 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+ dst, dst_stride, clip_width, clip_height);
+ return;
+ }
+ }
+ }
+ }
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled vertically.
+ ScalePlaneVertical(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, y, dy, 4, filtering);
+ return;
+ }
+ if (filtering && dy < 65536) {
+ ScaleARGBBilinearUp(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (filtering) {
+ ScaleARGBBilinearDown(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+ clip_x < 0 || clip_y < 0 ||
+ clip_width > 32768 || clip_height > 32768 ||
+ (clip_x + clip_width) > dst_width ||
+ (clip_y + clip_height) > dst_height) {
+ return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
dst_argb, dst_stride_argb, dst_width, dst_height,
- filtering);
+ clip_x, clip_y, clip_width, clip_height, filtering);
return 0;
}
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint32 src_fourcc,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ uint32 dst_fourcc,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+ int r;
+ I420ToARGB(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ argb_buffer, src_width * 4,
+ src_width, src_height);
+
+ r = ARGBScaleClip(argb_buffer, src_width * 4,
+ src_width, src_height,
+ dst_argb, dst_stride_argb,
+ dst_width, dst_height,
+ clip_x, clip_y, clip_width, clip_height,
+ filtering);
+ free(argb_buffer);
+ return r;
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc
new file mode 100644
index 0000000..3507aa4
--- /dev/null
+++ b/files/source/scale_common.cc
@@ -0,0 +1,1159 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ const uint16* s = src_ptr;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst += 1;
+ s += 2;
+ t += 2;
+ }
+ dst[0] = (s[0] + t[0] + 1) >> 1;
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+#if defined(__arm__) || defined(__aarch64__)
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+ ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#else
+// inteluses 7 bit math with rounding.
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+ (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+#endif
+
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+// Same as 8 bit arm blender but return is cast to uint16
+#define BLENDER(a, b, f) (uint16)((int)(a) + \
+ ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+// 8x3 -> 3x1
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// 8x2 -> 3x1
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ int x;
+ assert(src_width > 0);
+ for (x = 0; x < src_width - 1; x += 2) {
+ dst_ptr[0] += src_ptr[0];
+ dst_ptr[1] += src_ptr[1];
+ src_ptr += 2;
+ dst_ptr += 2;
+ }
+ if (src_width & 1) {
+ dst_ptr[0] += src_ptr[0];
+ }
+}
+
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+ int x;
+ assert(src_width > 0);
+ for (x = 0; x < src_width - 1; x += 2) {
+ dst_ptr[0] += src_ptr[0];
+ dst_ptr[1] += src_ptr[1];
+ src_ptr += 2;
+ dst_ptr += 2;
+ }
+ if (src_width & 1) {
+ dst_ptr[0] += src_ptr[0];
+ }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += src_stepx * 4;
+ dst_argb += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDERC(a, b, f, s) (uint32)( \
+ BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+ BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int y, int dy,
+ int bpp, enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher bpp.
+ int dst_width_bytes = dst_width * bpp;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(bpp >= 1 && bpp <= 4);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_DSPR2;
+ if (IS_ALIGNED(dst_width_bytes, 4)) {
+ InterpolateRow = InterpolateRow_DSPR2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow(dst_argb, src_argb + yi * src_stride,
+ src_stride, dst_width_bytes, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+void ScalePlaneVertical_16(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_argb, uint16* dst_argb,
+ int x, int y, int dy,
+ int wpp, enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher wpp.
+ int dst_width_words = dst_width * wpp;
+ void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(wpp >= 1 && wpp <= 2);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_16_DSPR2;
+ if (IS_ALIGNED(dst_width_bytes, 4)) {
+ InterpolateRow = InterpolateRow_16_DSPR2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow(dst_argb, src_argb + yi * src_stride,
+ src_stride, dst_width_words, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ if (src_width < 0) {
+ src_width = -src_width;
+ }
+ if (src_height < 0) {
+ src_height = -src_height;
+ }
+ if (filtering == kFilterBox) {
+ // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ filtering = kFilterBilinear;
+ }
+ }
+ if (filtering == kFilterBilinear) {
+ if (src_height == 1) {
+ filtering = kFilterLinear;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+ if (dst_height == src_height || dst_height * 3 == src_height) {
+ filtering = kFilterLinear;
+ }
+ // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+ // avoid reading 2 pixels horizontally that causes memory exception.
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+ if (dst_width == src_width || dst_width * 3 == src_width) {
+ filtering = kFilterNone;
+ }
+ }
+ return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+ return (int)(((int64)(num) << 16) / div);
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div) {
+ return (int)((((int64)(num) << 16) - 0x00010001) /
+ (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering,
+ int* x, int* y, int* dx, int* dy) {
+ assert(x != NULL);
+ assert(y != NULL);
+ assert(dx != NULL);
+ assert(dy != NULL);
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ // Check for 1 pixel and avoid FixedDiv overflow.
+ if (dst_width == 1 && src_width >= 32768) {
+ dst_width = src_width;
+ }
+ if (dst_height == 1 && src_height >= 32768) {
+ dst_height = src_height;
+ }
+ if (filtering == kFilterBox) {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = 0;
+ *y = 0;
+ } else if (filtering == kFilterBilinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ if (dst_height <= src_height) {
+ *dy = FixedDiv(src_height, dst_height);
+ *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_height > 1) {
+ *dy = FixedDiv1(src_height, dst_height);
+ *y = 0;
+ }
+ } else if (filtering == kFilterLinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ *dy = FixedDiv(src_height, dst_height);
+ *y = *dy >> 1;
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = CENTERSTART(*dx, 0);
+ *y = CENTERSTART(*dy, 0);
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ *x += (dst_width - 1) * *dx;
+ *dx = -*dx;
+ // src_width = -src_width; // Caller must do this.
+ }
+}
+#undef CENTERSTART
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
new file mode 100644
index 0000000..e2f8854
--- /dev/null
+++ b/files/source/scale_gcc.cc
@@ -0,0 +1,1322 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stridex3;
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "=&r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
+ MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(src_stride * 3)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "lea " MEMLEA(0xc,1) ",%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
+ "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static uvec8 kFsub80 =
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static uvec16 kFadd40 =
+ { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ intptr_t x0, x1, temp_pixel;
+ asm volatile (
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x2,0) ",%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2," MEMACCESS(0) " \n"
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "=&a"(temp_pixel), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1), // %4
+#if defined(__x86_64__)
+ "+rm"(dst_width) // %5
+#else
+ "+m"(dst_width) // %5
+#endif
+ : "rm"(x), // %6
+ "rm"(dx), // %7
+#if defined(__x86_64__)
+ "x"(kFsub80), // %8
+ "x"(kFadd40) // %9
+#else
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
+#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ LABELALIGN
+ "1: \n"
+ "movd " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ "punpckldq %%xmm1,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
+ MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "=&r"(src_stepx_x12) // %4
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride, int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ intptr_t row1 = (intptr_t)(src_stride);
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
+ MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
+ MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "movq " MEMACCESS(5) ",%%xmm2 \n"
+ MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
+ MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
+ MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "=&r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0, x1;
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ LABELALIGN
+ "40: \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
+ MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x8,2) ",%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "99: \n"
+ : "=&a"(x0), // %0
+ "=&d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0, x1;
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
+
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "psrlw $0x9,%%xmm1 \n"
+ MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(0) " \n"
+
+ LABELALIGN
+ "99: \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+ asm volatile (
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx"
+ );
+ return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+ asm volatile (
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx"
+ );
+ return num;
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_mips.cc b/files/source/scale_mips.cc
new file mode 100644
index 0000000..ae95307
--- /dev/null
+++ b/files/source/scale_mips.cc
@@ -0,0 +1,644 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ // TODO(fbarchard): Use odd pixels instead of even.
+ "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
+ "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
+ "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
+ "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t8, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t1, 8(%[dst]) \n"
+ "sw $t2, 12(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0xf \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t0, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 2 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* t = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ "bltz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 0(%[t]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[t]) \n" // |23|22|21|20|
+ "lw $t6, 8(%[t]) \n" // |27|26|25|24|
+ "lw $t7, 12(%[t]) \n" // |31|30|29|28|
+ "addiu $t9, $t9, -1 \n"
+ "srl $t8, $t0, 16 \n" // |X|X|3|2|
+ "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
+ "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
+ "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
+ "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
+ "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
+ "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
+ "srl $t8, $t1, 16 \n" // |X|X|7|6|
+ "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
+ "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
+ "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
+ "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
+ "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
+ "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
+ "srl $t8, $t2, 16 \n" // |X|X|11|10|
+ "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
+ "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
+ "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
+ "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
+ "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
+ "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
+ "srl $t8, $t3, 16 \n" // |X|X|15|14|
+ "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
+ "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
+ "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
+ "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
+ "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
+ "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
+ "addiu %[src_ptr], %[src_ptr], 16 \n"
+ "addiu %[t], %[t], 16 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "sb $t1, 2(%[dst]) \n"
+ "sb $t5, 3(%[dst]) \n"
+ "sb $t2, 4(%[dst]) \n"
+ "sb $t6, 5(%[dst]) \n"
+ "sb $t3, 6(%[dst]) \n"
+ "sb $t7, 7(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0x7 \n" // x = residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lwr $t1, 0(%[src_ptr]) \n"
+ "lwl $t1, 3(%[src_ptr]) \n"
+ "lwr $t2, 0(%[t]) \n"
+ "lwl $t2, 3(%[t]) \n"
+ "srl $t8, $t1, 16 \n"
+ "ins $t1, $t2, 16, 16 \n"
+ "ins $t2, $t8, 0, 16 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "shra_r.w $t1, $t1, 2 \n"
+ "shra_r.w $t2, $t2, 2 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "sb $t2, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -2 \n"
+ "addiu %[t], %[t], 4 \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 2 \n"
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst), [t] "+r" (t)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n"
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
+ "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
+ "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t5, 4(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 7 \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t1, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ const uint8* s2 = s1 + stride;
+ const uint8* s3 = s2 + stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 1 \n"
+ "andi $t8, %[dst_width], 1 \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
+ "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
+ "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
+ "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
+ "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
+ "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "add $t4, $t4, $t5 \n"
+ "add $t6, $t6, $t7 \n"
+ "add $t4, $t4, $t6 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "shra_r.w $t4, $t4, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[s3], %[s3], 8 \n"
+ "addiu $t9, $t9, -1 \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 2 \n"
+ "beqz $t8, 2f \n"
+ " nop \n"
+
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+
+ "2: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [s3] "+r" (s3)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
+ "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
+ "addiu %[dst_width], %[dst_width], -24 \n"
+ "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
+ "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
+ "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
+ "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
+ "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
+ "prepend $t1, $t2, 8 \n" // |4|3|1|0|
+ "prepend $t3, $t4, 24 \n" // |15|13|12|11|
+ "prepend $t5, $t6, 8 \n" // |20|19|17|16|
+ "prepend $t7, $t8, 24 \n" // |31|29|28|27|
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t3, 8(%[dst]) \n"
+ "sw $t5, 12(%[dst]) \n"
+ "sw $t9, 16(%[dst]) \n"
+ "sw $t7, 20(%[dst]) \n"
+ "bnez %[dst_width], 1b \n"
+ " addiu %[dst], %[dst], 24 \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t3, 3 \n" // 0x00030003
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
+ "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t2, $t2, $t4 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "sll $t5, $t0, 1 \n"
+ "add $t0, $t5, $t0 \n"
+ "shra_r.ph $t2, $t2, 2 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shll.ph $t4, $t2, 1 \n"
+ "addq.ph $t4, $t4, $t2 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.w $t0, $t0, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "srl $t1, $t6, 16 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t2, 3 \n" // 0x00030003
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
+ "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t4, $t4, $t3 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shra_r.ph $t4, $t4, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.ph $t6, $t6, 1 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "srl $t1, $t6, 16 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t6, $t6 \n" // |26|27|24|25|
+ "srl $t0, $t0, 8 \n" // |X|2|3|0|
+ "srl $t3, $t3, 16 \n" // |X|X|15|14|
+ "srl $t5, $t5, 16 \n" // |X|X|23|22|
+ "srl $t7, $t7, 16 \n" // |X|X|31|30|
+ "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
+ "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
+ "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
+ "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
+ "prepend $t2, $t3, 24 \n" // |X|15|14|11|
+ "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
+ "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu %[dst_width], %[dst_width], -12 \n"
+ "addiu $t8,%[dst_width], -12 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t4, 4(%[dst]) \n"
+ "sw $t6, 8(%[dst]) \n"
+ "bgez $t8, 1b \n"
+ " addiu %[dst], %[dst], 12 \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* t = src_ptr + stride;
+ const int c = 0x2AAA;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
+ "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
+ "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
+ "srl $t4, $t4, 2 \n" // t4 / 4
+ "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
+ "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
+ "addu $t6, $t5, $t6 \n"
+ "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
+ "addu $t0, $t0, $t2 \n"
+ "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[t], %[t], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t4, -1(%[dst_ptr]) \n"
+ "sb $t6, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [t] "+r" (t),
+ [dst_width] "+r" (dst_width)
+ : [c] "r" (c)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ stride += stride;
+ const uint8* s2 = src_ptr + stride;
+ const int c1 = 0x1C71;
+ const int c2 = 0x2AAA;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
+ "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
+ "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
+ "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
+ "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
+ "raddu.w.qb $t8, $t8 \n" // R5+R4
+ "addu $t7, $t7, $t8 \n"
+ "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
+ "raddu.w.qb $t8, $t8 \n" // R7 + R6
+ "addu $t6, $t6, $t8 \n"
+ "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
+ "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
+ "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
+ "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
+ "addu $t7, $t7, $t8 \n"
+ "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "raddu.w.qb $t4, $t4 \n"
+ "addu $t0, $t0, $t2 \n"
+ "addu $t0, $t0, $t4 \n"
+ "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t7, $t7, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t6, -1(%[dst_ptr]) \n"
+ "sb $t7, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [dst_width] "+r" (dst_width)
+ : [c1] "r" (c1), [c2] "r" (c2)
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
index a1946f0..44b0c80 100644
--- a/files/source/scale_neon.cc
+++ b/files/source/scale_neon.cc
@@ -4,11 +4,10 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -16,88 +15,120 @@
extern "C" {
#endif
-// This module is for GCC Neon
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
-/**
- * NEON downscalers with interpolation.
- *
- * Provided by Fritz Koenig
- *
- */
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+// Read 32x1 throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
- "1: \n"
+ "1: \n"
// load even pixels into q0, odd into q1
- "vld2.u8 {q0,q1}, [%0]! \n"
- "vst1.u8 {q0}, [%1]! \n" // store even pixels
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
);
}
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #1 \n"
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
- "1: \n"
- "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
- "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ MEMACCESS(1)
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
- "vst1.u8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "1: \n"
- "vld2.u8 {d0, d1}, [%0]! \n"
- "vtrn.u8 d1, d0 \n"
- "vshrn.u16 d0, q0, #8 \n"
- "vst1.u32 {d0[1]}, [%1]! \n"
- "subs %2, #4 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
);
}
-void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
- "add r4, %0, %3 \n"
- "add r5, r4, %3 \n"
- "add %3, r5, %3 \n"
- "1: \n"
- "vld1.u8 {q0}, [%0]! \n" // load up 16x4
- "vld1.u8 {q1}, [r4]! \n"
- "vld1.u8 {q2}, [r5]! \n"
- "vld1.u8 {q3}, [%3]! \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ MEMACCESS(3)
+ "vld1.8 {q1}, [%3]! \n"
+ MEMACCESS(4)
+ "vld1.8 {q2}, [%4]! \n"
+ MEMACCESS(5)
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
@@ -105,14 +136,17 @@
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
- "vst1.u32 {d0[0]}, [%1]! \n"
- "subs %2, #4 \n"
+ MEMACCESS(1)
+ "vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(src_stride) // %3
- : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc"
);
}
@@ -120,32 +154,37 @@
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "1: \n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
- "subs %2, #24 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "d0", "d1", "d2", "d3", "memory", "cc"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
);
}
-void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
@@ -180,29 +219,31 @@
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
- "subs %2, #24 \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
-void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
-
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
@@ -220,72 +261,83 @@
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
-
- "subs %2, #24 \n"
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN38_NEON
-const uvec8 kShuf38 =
+static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-const uvec8 kShuf38_2 =
+static uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-const vec16 kMult38_Div6 =
+static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-const vec16 kMult38_Div9 =
+static vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vld1.u8 {q3}, [%3] \n"
- "1: \n"
- "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.u8 {d4}, [%1]! \n"
- "vst1.u32 {d5[0]}, [%1]! \n"
- "subs %2, #12 \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ MEMACCESS(3)
+ "vld1.8 {q3}, [%3] \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d4}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
asm volatile (
- "vld1.u16 {q13}, [%4] \n"
- "vld1.u8 {q14}, [%5] \n"
- "vld1.u8 {q15}, [%6] \n"
- "add r4, %0, %3, lsl #1 \n"
- "add %3, %0 \n"
- "1: \n"
+ MEMACCESS(5)
+ "vld1.16 {q13}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {q14}, [%6] \n"
+ MEMACCESS(7)
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ "1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ MEMACCESS(4)
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -361,38 +413,44 @@
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.u8 {d3}, [%1]! \n"
- "vst1.u32 {d4[0]}, [%1]! \n"
- "subs %2, #12 \n"
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2), // %5
- "r"(&kMult38_Div9) // %6
- : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
- "q13", "q14", "q15", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
);
}
// 32x2 -> 12x1
-void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vld1.u16 {q13}, [%4] \n"
- "vld1.u8 {q14}, [%5] \n"
- "add %3, %0 \n"
- "1: \n"
+ MEMACCESS(4)
+ "vld1.16 {q13}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -457,78 +515,507 @@
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.u8 {d3}, [%1]! \n"
- "vst1.u32 {d4[0]}, [%1]! \n"
- "subs %2, #12 \n"
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ const uint8* src_tmp;
+ asm volatile (
+ "1: \n"
+ "mov %0, %1 \n"
+ "mov r12, %5 \n"
+ "veor q2, q2, q2 \n"
+ "veor q3, q3, q3 \n"
+ "2: \n"
+ // load 16 pixels into q0
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], %3 \n"
+ "vaddw.u8 q3, q3, d1 \n"
+ "vaddw.u8 q2, q2, d0 \n"
+ "subs r12, r12, #1 \n"
+ "bgt 2b \n"
+ MEMACCESS(2)
+ "vst1.16 {q2, q3}, [%2]! \n" // store pixels
+ "add %1, %1, #16 \n"
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "=&r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
+ :
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
+
+// The NEON version mimics this formula:
+// #define BLENDER(a, b, f) (uint8)((int)(a) +
+// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_ptr;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q1, q1, q0 \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
+
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
- "beq 2f \n"
+ "beq 100f \n"
"add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
"cmp %4, #128 \n"
- "beq 3f \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
- "1: \n"
- "vld1.u8 {q0}, [%1]! \n"
- "vld1.u8 {q1}, [%2]! \n"
- "subs %3, #16 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
- "vst1.u8 {q0}, [%0]! \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
- "b 4f \n"
+ "b 99f \n"
- "2: \n"
- "vld1.u8 {q0}, [%1]! \n"
- "subs %3, #16 \n"
- "vst1.u8 {q0}, [%0]! \n"
- "bgt 2b \n"
- "b 4f \n"
-
- "3: \n"
- "vld1.u8 {q0}, [%1]! \n"
- "vld1.u8 {q1}, [%2]! \n"
- "subs %3, #16 \n"
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
- "vst1.u8 {q0}, [%0]! \n"
- "bgt 3b \n"
- "4: \n"
- "vst1.u8 {d1[7]}, [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
- :
- : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ MEMACCESS(0)
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
-#endif // __ARM_NEON__
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS(0)
+ "vld2.32 {q0, q1}, [%0]! \n"
+ MEMACCESS(0)
+ "vld2.32 {q2, q3}, [%0]! \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ MEMACCESS(1)
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #1 \n"
+ "vrshrn.u16 d2, q2, #1 \n"
+ "vrshrn.u16 d3, q3, #1 \n"
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %3, lsl #2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d4}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d5}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d6}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld1.32 {"#dn"["#n"]}, [%6] \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ int tmp;
+ const uint8* src_tmp = src_argb;
+ asm volatile (
+ "1: \n"
+ LOAD1_DATA32_LANE(d0, 0)
+ LOAD1_DATA32_LANE(d0, 1)
+ LOAD1_DATA32_LANE(d1, 0)
+ LOAD1_DATA32_LANE(d1, 1)
+ LOAD1_DATA32_LANE(d2, 0)
+ LOAD1_DATA32_LANE(d2, 1)
+ LOAD1_DATA32_LANE(d3, 0)
+ LOAD1_DATA32_LANE(d3, 1)
+
+ MEMACCESS(0)
+ "vst1.32 {q0, q1}, [%0]! \n" // store pixels
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "=&r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1"
+ );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_argb;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(d0, d2, 0)
+ LOAD2_DATA32_LANE(d0, d2, 1)
+ LOAD2_DATA32_LANE(d1, d3, 0)
+ LOAD2_DATA32_LANE(d1, d3, 1)
+ "vshrn.i32 d22, q8, #9 \n"
+ "vand.16 d22, d22, d30 \n"
+ "vdup.8 d24, d22[0] \n"
+ "vdup.8 d25, d22[2] \n"
+ "vdup.8 d26, d22[4] \n"
+ "vdup.8 d27, d22[6] \n"
+ "vext.8 d4, d24, d25, #4 \n"
+ "vext.8 d5, d26, d27, #4 \n" // f
+ "veor.8 q10, q2, q3 \n" // 0x7f ^ f
+ "vmull.u8 q11, d0, d20 \n"
+ "vmull.u8 q12, d1, d21 \n"
+ "vmull.u8 q13, d2, d4 \n"
+ "vmull.u8 q14, d3, d5 \n"
+ "vadd.i16 q11, q11, q13 \n"
+ "vadd.i16 q12, q12, q14 \n"
+ "vshrn.i16 d0, q11, #7 \n"
+ "vshrn.i16 d1, q12, #7 \n"
+
+ MEMACCESS(0)
+ "vst1.32 {d0, d1}, [%0]! \n" // store pixels
+ "vadd.s32 q8, q8, q9 \n"
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
new file mode 100644
index 0000000..ff277f2
--- /dev/null
+++ b/files/source/scale_neon64.cc
@@ -0,0 +1,1042 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1 throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #1 \n"
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ MEMACCESS(1)
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ MEMACCESS(2)
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ MEMACCESS(3)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ MEMACCESS(4)
+ "ld1 {v2.16b}, [%3], #16 \n"
+ MEMACCESS(5)
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "uadalp v0.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n"
+ "uadalp v0.8h, v3.16b \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ MEMACCESS(1)
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
+ MEMACCESS(1)
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
+
+ // 3 * line_0 + line_1
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
+
+ MEMACCESS(1)
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+ "v20", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+ // average src line 0 with src line 1
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
+
+ MEMACCESS(1)
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
+ );
+}
+
+static uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+ { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(3)
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ MEMACCESS(1)
+ "st1 {v2.8b}, [%1], #8 \n"
+ MEMACCESS(1)
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ ptrdiff_t tmp_src_stride = src_stride;
+
+ asm volatile (
+ MEMACCESS(5)
+ "ld1 {v29.8h}, [%5] \n"
+ MEMACCESS(6)
+ "ld1 {v30.16b}, [%6] \n"
+ MEMACCESS(7)
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ MEMACCESS(4)
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+
+ // combine source lines
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // combine source lines
+ "add v0.8h, v0.8h, v16.8h \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+
+ // 0+1+2, 3+4+5
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+ MEMACCESS(1)
+ "st1 {v3.8b}, [%1], #8 \n"
+ MEMACCESS(1)
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+ "v30", "v31", "memory", "cc"
+ );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride;
+ asm volatile (
+ MEMACCESS(4)
+ "ld1 {v30.8h}, [%4] \n"
+ MEMACCESS(5)
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+
+ // combine source lines
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "uqrshrn v2.8b, v2.8h, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+ MEMACCESS(1)
+ "st1 {v3.8b}, [%1], #8 \n"
+ MEMACCESS(1)
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v30", "v31", "memory", "cc"
+ );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ const uint8* src_tmp;
+ asm volatile (
+ "1: \n"
+ "mov %0, %1 \n"
+ "mov w12, %w5 \n"
+ "eor v2.16b, v2.16b, v2.16b \n"
+ "eor v3.16b, v3.16b, v3.16b \n"
+ "2: \n"
+ // load 16 pixels into q0
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n"
+ "uaddw2 v3.8h, v3.8h, v0.16b \n"
+ "uaddw v2.8h, v2.8h, v0.8b \n"
+ "subs w12, w12, #1 \n"
+ "b.gt 2b \n"
+ MEMACCESS(2)
+ "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
+ "add %1, %1, #16 \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "=&r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
+ :
+ : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld2 {v4.b, v5.b}["#n"], [%6] \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_ptr;
+ int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64) x;
+ int64 dx64 = (int64) dx;
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v1.4s, v1.4s, v0.4s \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ MEMACCESS(0)
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width64), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3",
+ "v4", "v5", "v6", "v7", "v16", "v17"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction;
+ asm volatile (
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ MEMACCESS(0)
+ "st1 {v0.b}[15], [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction),// %4
+ "+r"(y_fraction) // %5
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
+ );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS (0)
+ "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
+ MEMACCESS (0)
+ "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ MEMACCESS (1)
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ MEMACCESS (1)
+ "st1 {v3.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (dst), // %1
+ "+r" (dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS (0)
+ // load 8 ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #1 \n"
+ "rshrn v2.8b, v2.8h, #1 \n"
+ "rshrn v3.8b, v3.8h, #1 \n"
+ MEMACCESS (1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS (0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS (1)
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ MEMACCESS (2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (src_stride), // %1
+ "+r" (dst), // %2
+ "+r" (dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((int64)(src_stepx * 4)) // %3
+ : "memory", "cc", "v0"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
+// It could be upgraded to 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1], %4 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %4 \n"
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1], %4 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %4 \n"
+ MEMACCESS(1)
+ "ld1 {v5.8b}, [%1], %4 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %4 \n"
+ MEMACCESS(1)
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"((int64)(src_stepx * 4)) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld1 {"#vn".s}["#n"], [%6] \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint8* src_tmp = src_argb;
+ int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64) x;
+ int64 dx64 = (int64) dx;
+ int64 tmp64;
+ asm volatile (
+ "1: \n"
+ LOAD1_DATA32_LANE(v0, 0)
+ LOAD1_DATA32_LANE(v0, 1)
+ LOAD1_DATA32_LANE(v0, 2)
+ LOAD1_DATA32_LANE(v0, 3)
+ LOAD1_DATA32_LANE(v1, 0)
+ LOAD1_DATA32_LANE(v1, 1)
+ LOAD1_DATA32_LANE(v1, 2)
+ LOAD1_DATA32_LANE(v1, 3)
+
+ MEMACCESS(0)
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width64), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "=&r"(tmp64), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1"
+ );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_argb;
+ int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64) x;
+ int64 dx64 = (int64) dx;
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(v0, v1, 0)
+ LOAD2_DATA32_LANE(v0, v1, 1)
+ LOAD2_DATA32_LANE(v0, v1, 2)
+ LOAD2_DATA32_LANE(v0, v1, 3)
+ "shrn v2.4h, v5.4s, #9 \n"
+ "and v2.8b, v2.8b, v4.8b \n"
+ "dup v16.8b, v2.b[0] \n"
+ "dup v17.8b, v2.b[2] \n"
+ "dup v18.8b, v2.b[4] \n"
+ "dup v19.8b, v2.b[6] \n"
+ "ext v2.8b, v16.8b, v17.8b, #4 \n"
+ "ext v17.8b, v18.8b, v19.8b, #4 \n"
+ "ins v2.d[1], v17.d[0] \n" // f
+ "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+ "umull v16.8h, v0.8b, v7.8b \n"
+ "umull2 v17.8h, v0.16b, v7.16b \n"
+ "umull v18.8h, v1.8b, v2.8b \n"
+ "umull2 v19.8h, v1.16b, v2.16b \n"
+ "add v16.8h, v16.8h, v18.8h \n"
+ "add v17.8h, v17.8h, v19.8h \n"
+ "shrn v0.8b, v16.8h, #7 \n"
+ "shrn2 v0.16b, v17.8h, #7 \n"
+
+ MEMACCESS(0)
+ "st1 {v0.4s}, [%0], #16 \n" // store pixels
+ "add v5.4s, v5.4s, v6.4s \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width64), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v16", "v17", "v18", "v19"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc
new file mode 100644
index 0000000..f170973
--- /dev/null
+++ b/files/source/scale_win.cc
@@ -0,0 +1,1374 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+__declspec(naked)
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x1 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ packuswb xmm4, xmm4
+ pxor xmm5, xmm5 // constant 0
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ packuswb xmm4, xmm4
+ pxor xmm5, xmm5 // constant 0
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add
+ paddw xmm1, xmm3
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+ pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
+// Blends 64x2 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add
+ vpaddw ymm1, ymm1, ymm3
+ vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
+ vpsrlw ymm1, ymm1, 1
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+// Point samples 32 pixels to 8 pixels.
+__declspec(naked)
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
+ psrld xmm5, 24
+ pslld xmm5, 16
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x4 rectangle to 8x1.
+__declspec(naked)
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ movdqa xmm5, xmm4
+ packuswb xmm4, xmm4
+ psllw xmm5, 3 // constant 0x0008
+
+ wloop:
+ movdqu xmm0, [eax] // average rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add rows 0, 1
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + esi * 2]
+ movdqu xmm3, [eax + esi * 2 + 16]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 2
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + edi]
+ movdqu xmm3, [eax + edi + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 3
+ paddw xmm1, xmm3
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // + 8 for round
+ psrlw xmm0, 4 // /16 for average of 4 * 4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg wloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels.
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
+ vpsrld ymm5, ymm5, 24
+ vpslld ymm5, ymm5, 16
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
+ vpsrlw ymm4, ymm4, 15
+ vpsllw ymm5, ymm4, 3 // constant 0x0008
+ vpackuswb ymm4, ymm4, ymm4
+
+ wloop:
+ vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + esi * 2]
+ vmovdqu ymm3, [eax + esi * 2 + 32]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 2
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + edi]
+ vmovdqu ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 3
+ vpaddw ymm1, ymm1, ymm3
+ vphaddw ymm0, ymm0, ymm1 // mutates
+ vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
+ vpaddw ymm0, ymm0, ymm5 // + 8 for round
+ vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+__declspec(naked)
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, xmmword ptr kShuf0
+ movdqa xmm4, xmmword ptr kShuf1
+ movdqa xmm5, xmmword ptr kShuf2
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
+ palignr xmm1, xmm0, 8
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShuf01
+ movdqa xmm3, xmmword ptr kShuf11
+ movdqa xmm4, xmmword ptr kShuf21
+ movdqa xmm5, xmmword ptr kMadd01
+ movdqa xmm6, xmmword ptr kMadd11
+ movdqa xmm7, xmmword ptr kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, xmmword ptr kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShuf01
+ movdqa xmm3, xmmword ptr kShuf11
+ movdqa xmm4, xmmword ptr kShuf21
+ movdqa xmm5, xmmword ptr kMadd01
+ movdqa xmm6, xmmword ptr kMadd11
+ movdqa xmm7, xmmword ptr kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, xmmword ptr kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ sub ecx, 24
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked)
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, xmmword ptr kShuf38a
+ movdqa xmm5, xmmword ptr kShuf38b
+
+ xloop:
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1
+
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edx + 8], xmm1
+ lea edx, [edx + 12]
+ sub ecx, 12
+ jg xloop
+
+ ret
+ }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShufAc
+ movdqa xmm3, xmmword ptr kShufAc3
+ movdqa xmm4, xmmword ptr kScaleAc33
+ pxor xmm5, xmm5
+
+ xloop:
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm6, [eax + esi]
+ movhlps xmm1, xmm0
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqu xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
+
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
+
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
+
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6
+ lea edx, [edx + 6]
+ sub ecx, 6
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShufAb0
+ movdqa xmm3, xmmword ptr kShufAb1
+ movdqa xmm4, xmmword ptr kShufAb2
+ movdqa xmm5, xmmword ptr kScaleAb2
+
+ xloop:
+ movdqu xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm1, [eax + esi]
+ lea eax, [eax + 16]
+ pavgb xmm0, xmm1
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6
+ pshufb xmm0, xmm4
+ paddusw xmm1, xmm0
+
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
+
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1
+ lea edx, [edx + 6]
+ sub ecx, 6
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ pxor xmm5, xmm5
+
+ // sum rows
+ xloop:
+ movdqu xmm3, [eax] // read 16 bytes
+ lea eax, [eax + 16]
+ movdqu xmm0, [edx] // read 16 words from destination
+ movdqu xmm1, [edx + 16]
+ movdqa xmm2, xmm3
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
+ movdqu [edx], xmm0 // write 16 words to destination
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 16
+ jg xloop
+ ret
+ }
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ vpxor ymm5, ymm5, ymm5
+
+ // sum rows
+ xloop:
+ vmovdqu ymm3, [eax] // read 32 bytes
+ lea eax, [eax + 32]
+ vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
+ vpunpcklbw ymm2, ymm3, ymm5
+ vpunpckhbw ymm3, ymm3, ymm5
+ vpaddusw ymm0, ymm2, [edx] // sum 16 words
+ vpaddusw ymm1, ymm3, [edx + 32]
+ vmovdqu [edx], ymm0 // write 32 words to destination
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 32
+ jg xloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static uvec8 kFsub80 =
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static uvec16 kFadd40 =
+ { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+
+// Bilinear column filtering. SSSE3 version.
+__declspec(naked)
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov edi, [esp + 12 + 4] // dst_ptr
+ mov esi, [esp + 12 + 8] // src_ptr
+ mov ecx, [esp + 12 + 12] // dst_width
+ movd xmm2, [esp + 12 + 16] // x
+ movd xmm3, [esp + 12 + 20] // dx
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel.
+ movd xmm5, eax
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pcmpeqb xmm7, xmm7 // generate 0x0001
+ psrlw xmm7, 15
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm1, 9 // 7 bit fractions.
+ movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
+ movd xmm4, ebx
+ pshufb xmm1, xmm5 // 0011
+ punpcklwd xmm0, xmm4
+ psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
+ psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm1, xmm1 // 8 bits, 2 pixels.
+ movd ebx, xmm1
+ mov [edi], bx
+ lea edi, [edi + 2]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm2, 9 // 7 bit fractions.
+ pshufb xmm2, xmm5 // 0011
+ psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm2, xmm0 // 16 bit
+ paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
+ psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm2, xmm2 // 8 bits
+ movd ebx, xmm2
+ mov [edi], bl
+
+ xloop99:
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+__declspec(naked)
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ ret
+ }
+}
+
+// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
+__declspec(naked)
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0xdd
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x1 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels at a time.
+__declspec(naked)
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4]
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1.
+__declspec(naked)
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked)
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push edi
+ push esi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
+ paddd xmm2, xmm0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
+ paddd xmm2, xmm0 // x3 x2 x1 x0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
+
+ pextrw eax, xmm2, 1 // get x0 integer.
+ pextrw edx, xmm2, 3 // get x1 integer.
+
+ cmp ecx, 0
+ jle xloop99
+ sub ecx, 4
+ jl xloop49
+
+ // 4 Pixel loop.
+ xloop4:
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ pextrw edx, xmm2, 7 // get x3 integer.
+ paddd xmm2, xmm3 // x += dx
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movd xmm1, [esi + eax * 4] // 1 source x2 pixels
+ movd xmm4, [esi + edx * 4] // 1 source x3 pixels
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ punpckldq xmm1, xmm4 // x2 x3
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4 // 4 pixels
+ jge xloop4
+
+ xloop49:
+ test ecx, 2
+ je xloop29
+
+ // 2 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+
+ xloop29:
+ test ecx, 1
+ je xloop99
+
+ // 1 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x2 pixels
+ movd dword ptr [edi], xmm0
+ xloop99:
+
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked)
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ movdqa xmm4, xmmword ptr kShuffleColARGB
+ movdqa xmm5, xmmword ptr kShuffleFractions
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ psrlw xmm1, 9 // 7 bit fractions.
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ movd [edi], xmm0
+
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+__declspec(naked)
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_argb
+ mov eax, [esp + 8] // src_argb
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm0
+ punpckhdq xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg wloop
+
+ ret
+ }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ idiv dword ptr [esp + 8]
+ ret
+ }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv1_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ mov ecx, [esp + 8] // denom
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ sub eax, 0x00010001
+ sbb edx, 0
+ sub ecx, 1
+ idiv ecx
+ ret
+ }
+}
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 616affd..00fb71e 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -4,7 +4,7 @@
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -16,30 +16,40 @@
extern "C" {
#endif
-#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0]))))
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
struct FourCCAliasEntry {
uint32 alias;
uint32 canonical;
};
-static const FourCCAliasEntry kFourCCAliases[] = {
+static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
+ {FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
- {FOURCC_YUVS, FOURCC_YUY2},
+ {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
{FOURCC_HDYC, FOURCC_UYVY},
- {FOURCC_2VUY, FOURCC_UYVY},
- {FOURCC_BA81, FOURCC_BGGR},
+ {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
- {FOURCC_RGB3, FOURCC_RAW},
+ {FOURCC_DMB1, FOURCC_MJPG},
+ {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
+ {FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
+ {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
+ {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
+ {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
+ {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
+ {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
LIBYUV_API
uint32 CanonicalFourCC(uint32 fourcc) {
- for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
if (kFourCCAliases[i].alias == fourcc) {
return kFourCCAliases[i].canonical;
}