Port YUV-to-RGB row conversion to Mac: consolidate the YUV conversion constants into a single SIMD-aligned struct and address them via one register operand with fixed byte offsets, instead of eight separate "m" memory operands in the inline asm.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/269017
git-svn-id: http://libyuv.googlecode.com/svn/trunk@83 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 2eb5fc3..83a92ba 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -319,13 +319,6 @@
#endif
-#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
-
-vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-#endif
-
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
@@ -340,22 +333,7 @@
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128
-vec8 kUVToB = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-vec8 kUVToR = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-vec8 kUVToG = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-
-vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
@@ -363,7 +341,27 @@
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
-// This version produces 8 pixels
+struct {
+ vec8 kUVToB;
+ vec8 kUVToG;
+ vec8 kUVToR;
+ vec16 kUVBiasB;
+ vec16 kUVBiasG;
+ vec16 kUVBiasR;
+ vec16 kYSub16;
+ vec16 kYToRgb;
+} SIMD_ALIGNED(kYuvConstants) = {
+ { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR },
+ { 16, 16, 16, 16, 16, 16, 16, 16 },
+ { YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// Convert 8 pixels
#define YUVTORGB \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
@@ -372,17 +370,17 @@
"punpcklwd %%xmm0,%%xmm0 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw %5,%%xmm0 \n" \
- "pmaddubsw %6,%%xmm1 \n" \
- "pmaddubsw %7,%%xmm2 \n" \
- "psubw %8,%%xmm0 \n" \
- "psubw %9,%%xmm1 \n" \
- "psubw %10,%%xmm2 \n" \
+ "pmaddubsw (%5),%%xmm0 \n" \
+ "pmaddubsw 16(%5),%%xmm1 \n" \
+ "pmaddubsw 32(%5),%%xmm2 \n" \
+ "psubw 48(%5),%%xmm0 \n" \
+ "psubw 64(%5),%%xmm1 \n" \
+ "psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw %11,%%xmm3 \n" \
- "pmullw %12,%%xmm3 \n" \
+ "psubsw 96(%5),%%xmm3 \n" \
+ "pmullw 112(%5),%%xmm3 \n" \
"paddw %%xmm3,%%xmm0 \n" \
"paddw %%xmm3,%%xmm1 \n" \
"paddw %%xmm3,%%xmm2 \n" \
@@ -420,14 +418,7 @@
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+rm"(width) // %4
- : "m" (kUVToB), // %5
- "m" (kUVToG), // %6
- "m" (kUVToR), // %7
- "m" (kUVBiasB), // %8
- "m" (kUVBiasG), // %9
- "m" (kUVBiasR), // %10
- "m" (kYSub16), // %11
- "m" (kYToRgb) // %12
+ : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -463,14 +454,7 @@
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+rm"(width) // %4
- : "m" (kUVToB), // %5
- "m" (kUVToG), // %6
- "m" (kUVToR), // %7
- "m" (kUVBiasB), // %8
- "m" (kUVBiasG), // %9
- "m" (kUVBiasR), // %10
- "m" (kYSub16), // %11
- "m" (kYToRgb) // %12
+ : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -508,14 +492,7 @@
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+rm"(width) // %4
- : "m" (kUVToB), // %5
- "m" (kUVToG), // %6
- "m" (kUVToR), // %7
- "m" (kUVBiasB), // %8
- "m" (kUVBiasG), // %9
- "m" (kUVBiasR), // %10
- "m" (kYSub16), // %11
- "m" (kYToRgb) // %12
+ : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -540,17 +517,17 @@
"punpcklbw %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
- "pmaddubsw %5,%%xmm0 \n"
- "pmaddubsw %6,%%xmm1 \n"
- "pmaddubsw %7,%%xmm2 \n"
- "psubw %8,%%xmm0 \n"
- "psubw %9,%%xmm1 \n"
- "psubw %10,%%xmm2 \n"
+ "pmaddubsw (%5),%%xmm0 \n"
+ "pmaddubsw 16(%5),%%xmm1 \n"
+ "pmaddubsw 32(%5),%%xmm2 \n"
+ "psubw 48(%5),%%xmm0 \n"
+ "psubw 64(%5),%%xmm1 \n"
+ "psubw 80(%5),%%xmm2 \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
"punpcklbw %%xmm4,%%xmm3 \n"
- "psubsw %11,%%xmm3 \n"
- "pmullw %12,%%xmm3 \n"
+ "psubsw 96(%5),%%xmm3 \n"
+ "pmullw 112(%5),%%xmm3 \n"
"paddw %%xmm3,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm2 \n"
@@ -572,14 +549,7 @@
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+rm"(width) // %4
- : "m" (kUVToB), // %5
- "m" (kUVToG), // %6
- "m" (kUVToR), // %7
- "m" (kUVBiasB), // %8
- "m" (kUVBiasG), // %9
- "m" (kUVBiasR), // %10
- "m" (kYSub16), // %11
- "m" (kYToRgb) // %12
+ : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -625,8 +595,8 @@
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
"+rm"(width) // %2
- : "m" (kYSub16), // %3
- "m" (kYToRgb) // %4
+ : "m"(kYuvConstants.kYSub16), // %3
+ "m"(kYuvConstants.kYToRgb) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"