4 pixel version of affine for gcc and aligned version for win.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714007
git-svn-id: http://libyuv.googlecode.com/svn/trunk@320 16f28f9a-4ce2-e073-06de-1de4eb20be90
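For reference, the SSE2 path below computes the same thing as this scalar
loop (a minimal sketch for illustration only, not part of the patch; uint8
and uint32 stand in for the typedefs in libyuv's basic_types.h, and the
library's real C fallback is ARGBAffineRow_C in row_common.cc). uv_dudv
holds {u, v, du, dv}: the starting source coordinate and its step per
destination pixel.

    typedef unsigned char uint8;   /* as in libyuv basic_types.h */
    typedef unsigned int uint32;

    void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
      float u = uv_dudv[0];
      float v = uv_dudv[1];
      int i;
      for (i = 0; i < width; ++i) {
        int x = (int)u;  /* truncate toward zero, like cvttps2dq */
        int y = (int)v;
        /* Copy one 4 byte ARGB pixel from (x, y) in the source image. */
        *(uint32*)(dst_argb + i * 4) =
            *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
        u += uv_dudv[2];
        v += uv_dudv[3];
      }
    }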
diff --git a/source/row_posix.cc b/source/row_posix.cc
index ee1dbc0..b70fcd0 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3220,61 +3220,91 @@
#endif // HAS_ARGBSHADE_SSE2
#ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width) {
intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp = 0;
asm volatile (
"movq (%3),%%xmm2 \n"
- "movq 0x8(%3),%%xmm3 \n"
+ "movq 0x8(%3),%%xmm7 \n"
"shl $0x10,%1 \n"
"add $0x4,%1 \n"
- "movd %1,%%xmm4 \n"
- "xor %1,%1 \n" // cleanse upper bits.
- "sub $0x2,%4 \n"
- "jl 29f \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
"movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm3,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
"movlhps %%xmm0,%%xmm2 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movlhps %%xmm3,%%xmm3 \n"
- "addps %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
- // 2 pixel loop \n"
- ".p2align 2 \n"
- "20: \n"
- "cvttps2dq %%xmm2,%%xmm1 \n"
- "packssdw %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm4,%%xmm1 \n"
- "addps %%xmm3,%%xmm2 \n"
- "movd %%xmm1,%1 \n"
+ // 4 pixel loop \n"
+ ".p2align 4 \n"
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "cvttps2dq %%xmm3,%%xmm1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+#if defined(__x86_64__)
+ "movq %%xmm0,%1 \n"
+ "mov %1,%5 \n"
"and $0x0fffffff,%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "pshufd $0x55,%%xmm5,%%xmm5 \n"
+ "shr $32,%5 \n"
+ "pshufd $0xEE,%%xmm0,%%xmm0 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+#endif
+ "movd (%0,%1,1),%%xmm1 \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+#if defined(__x86_64__)
+ "movq %%xmm0,%1 \n"
+ "mov %1,%5 \n"
+ "and $0x0fffffff,%1 \n"
+ "shr $32,%5 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+#endif
"movd (%0,%1,1),%%xmm0 \n"
- "movd %%xmm5,%1 \n"
- "and $0x0fffffff,%1 \n"
- "movd (%0,%1,1),%%xmm5 \n"
- "punpckldq %%xmm5,%%xmm0 \n"
- "sub $0x2,%4 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
- "jge 20b \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "sub $0x4,%4 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
- "29: \n"
- "add $0x1,%4 \n"
+ "49: \n"
+ "add $0x3,%4 \n"
"jl 19f \n"
// 1 pixel loop \n"
- ".p2align 2 \n"
+ ".p2align 4 \n"
"10: \n"
- "cvttps2dq %%xmm2,%%xmm1 \n"
- "packssdw %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm4,%%xmm1 \n"
- "addps %%xmm3,%%xmm2 \n"
- "movd %%xmm1,%1 \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%1 \n"
+#if defined(__x86_64__)
"and $0x0fffffff,%1 \n"
+#endif
"movd (%0,%1,1),%%xmm0 \n"
"sub $0x1,%4 \n"
"movd %%xmm0,(%2) \n"
@@ -3285,11 +3315,12 @@
"+r"(src_argb_stride_temp), // %1
"+r"(dst_argb), // %2
"+r"(uv_dudv), // %3
- "+rm"(width) // %4
+ "+rm"(width), // %4
+ "+r"(temp) // %5
:
: "memory", "cc"
#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
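The per-pixel byte offset falls out of a single pmaddwd: %1 is preloaded
with (src_argb_stride << 16) + 4 and broadcast into xmm5, cvttps2dq
truncates the float (u, v) pairs, packssdw narrows each pair into 16 bit
lanes, and pmaddwd then produces x * 4 + y * stride for each pixel. On the
64 bit path, where movq/movd pull 64 bits into the scratch register at
once, the and $0x0fffffff clears the upper half (the neighboring offset)
before the value is used as an index; the first TODO is about removing
that masking. A rough scalar model of the offset step (a hypothetical
helper for illustration; it assumes x and y fit in signed 16 bits, so the
packssdw saturation is a no-op):

    /* Scalar model of the packssdw + pmaddwd offset trick used above. */
    static int AffineByteOffset(int x, int y, int stride) {
      short xs = (short)x;  /* packssdw: saturate/narrow to 16 bits */
      short ys = (short)y;
      /* pmaddwd against (stride << 16) | 4 multiplies the low word by 4
         and the high word by stride, then sums the two products: */
      return xs * 4 + ys * stride;
    }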