I400ToARGB: use 8.8 fixed point to avoid a shift. For gcc, generate the constants in registers from immediates instead of loading them from memory, to avoid an -fPIC performance stall.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/322013
git-svn-id: http://libyuv.googlecode.com/svn/trunk@106 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index fad20b2..eadde78 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -558,47 +558,49 @@
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
+
void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm2 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "mov $0x10001000,%%eax \n"
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "mov $0x012a012a,%%eax \n"
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
"1: \n"
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "psubsw %%xmm3,%%xmm0 \n"
- "pmullw %%xmm2,%%xmm0 \n"
- "psraw $0x6,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164, evaluated in 8.8 fixed point: each word is y * 0x101, minus 0x1000 (16.0) with unsigned saturation, times 0x012a (~1.164), keeping the high 16 bits of the product.
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
- // Step 2: Weave into ARGB
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm1,16(%1) \n"
- "lea 32(%1),%1 \n"
+ // Step 2: Weave into ARGB: replicate each G byte into all four bytes of its pixel, then OR in the 0xff000000 mask held in xmm4 so the top byte of every pixel is 0xff. Both unpacks are done before the ORs and the two 16-byte stores.
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,16(%1) \n"
+ "lea 32(%1),%1 \n"
- "sub $0x8,%2 \n"
- "ja 1b \n"
+ "sub $0x8,%2 \n"
+ "ja 1b \n"
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
"+rm"(width) // %2
- : "m"(kYuvConstants.kYSub16), // %3
- "m"(kYuvConstants.kYToRgb) // %4
- : "memory", "cc"
+ :
+ : "memory", "cc", "eax"
#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
);
}