p2align all loops, copy stride to a local for scale, and copy the last byte in bilinear filtering more efficiently
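
This CL inserts a ".p2align 4" assembler directive immediately before the "1:" label of every inline-assembly loop in row_posix.cc. The directive pads the instruction stream with nops so the loop entry lands on a 16-byte boundary, keeping each iteration of the hot loop inside aligned instruction-fetch blocks on x86. A minimal hedged sketch of the pattern on a trivial copy loop (the function name and loop body are illustrative, not code from this patch):

    #include <stdint.h>

    // Illustrative only: the same ".p2align 4" placement every hunk
    // below applies, shown on a 16-bytes-per-iteration copy loop.
    static void CopyRow_Example(const uint8_t* src, uint8_t* dst, int count) {
      asm volatile (
        ".p2align 4          \n"  // pad so "1:" starts on a 1 << 4 = 16 byte boundary
        "1:                  \n"
        "movdqu (%0),%%xmm0  \n"  // load 16 source bytes
        "lea 0x10(%0),%0     \n"
        "movdqu %%xmm0,(%1)  \n"  // store 16 bytes
        "lea 0x10(%1),%1     \n"
        "sub $0x10,%2        \n"  // count is assumed a positive multiple of 16
        "jg 1b               \n"
        : "+r"(src), "+r"(dst), "+r"(count)
        :
        : "memory", "cc", "xmm0"
      );
    }
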
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007
git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
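
Note: the title's other two changes ("copy stride to a local for scale" and the more efficient bilinear last-byte copy) land in the scale code and are not part of the hunks below, which cover only the .p2align additions to row_posix.cc. As a hedged sketch of the stride idea only (the function, its operands, and the row-averaging loop are assumptions, not the actual scale code): copying the stride parameter into a local intptr_t gives the asm constraint an operand of native register width, so it can be used directly as the index in a base+index address.

    #include <stddef.h>
    #include <stdint.h>

    // Illustrative only: a local register-width copy of the stride,
    // bound as an asm input and used for base+index addressing.
    static void AverageRows_Example(const uint8_t* src, ptrdiff_t src_stride,
                                    uint8_t* dst, int width) {
      intptr_t stride = static_cast<intptr_t>(src_stride);  // local copy for the asm operand
      asm volatile (
        ".p2align 4              \n"
        "1:                      \n"
        "movdqu (%0),%%xmm0      \n"  // 16 bytes of the current row
        "movdqu (%0,%3,1),%%xmm1 \n"  // same columns, one row down, via the local stride
        "lea 0x10(%0),%0         \n"
        "pavgb %%xmm1,%%xmm0     \n"  // byte-wise rounded average of the two rows
        "movdqu %%xmm0,(%1)      \n"
        "lea 0x10(%1),%1         \n"
        "sub $0x10,%2            \n"  // width is assumed a positive multiple of 16
        "jg 1b                   \n"
        : "+r"(src), "+r"(dst), "+r"(width)
        : "r"(stride)
        : "memory", "cc", "xmm0", "xmm1"
      );
    }
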
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 1a8f4fb..122b309 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -112,6 +112,7 @@
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
+ ".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
@@ -141,6 +142,7 @@
asm volatile (
"movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@@ -164,6 +166,7 @@
asm volatile (
"movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@@ -187,6 +190,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -227,6 +231,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -279,6 +284,7 @@
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -327,6 +333,7 @@
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -372,6 +379,7 @@
"pslld $0x4,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -405,6 +413,7 @@
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -445,6 +454,7 @@
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -491,6 +501,7 @@
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -531,6 +542,7 @@
"pslld $0xa,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"pslld $0xf,%%xmm7 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -570,6 +582,7 @@
"psllw $0xc,%%xmm4 \n"
"movdqa %%xmm4,%%xmm3 \n"
"psrlw $0x8,%%xmm3 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -599,6 +612,7 @@
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -635,6 +649,7 @@
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -689,6 +704,7 @@
);
asm volatile (
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -753,6 +769,7 @@
);
asm volatile (
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -808,6 +825,7 @@
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -844,6 +862,7 @@
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -893,6 +912,7 @@
);
asm volatile (
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -957,6 +977,7 @@
);
asm volatile (
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -1012,6 +1033,7 @@
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -1048,6 +1070,7 @@
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -1097,6 +1120,7 @@
);
asm volatile (
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -1161,6 +1185,7 @@
);
asm volatile (
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -1295,6 +1320,7 @@
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
@@ -1329,6 +1355,7 @@
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
@@ -1364,6 +1391,7 @@
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
@@ -1398,6 +1426,7 @@
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
@@ -1432,6 +1461,7 @@
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
@@ -1467,6 +1497,7 @@
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
@@ -1501,6 +1532,7 @@
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movd (%1),%%xmm0 \n"
"movd (%1,%2,1),%%xmm1 \n"
@@ -1562,6 +1594,7 @@
"mov $0x012a012a,%%eax \n"
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
+ ".p2align 4 \n"
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq (%0),%%xmm0 \n"
@@ -1607,6 +1640,7 @@
asm volatile (
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0,%2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@@ -1631,6 +1665,7 @@
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0,%2),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -1668,6 +1703,7 @@
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
@@ -1695,6 +1731,7 @@
void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
@@ -1725,6 +1762,7 @@
void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
@@ -1758,6 +1796,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -1833,6 +1872,7 @@
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -1861,6 +1901,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -1900,6 +1941,7 @@
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -1930,6 +1972,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -1965,6 +2008,7 @@
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -1993,6 +2037,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -2029,6 +2074,7 @@
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile (
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -2057,6 +2103,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -2109,6 +2156,7 @@
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm3 \n"
"movdqa %%xmm3,%%xmm0 \n"
@@ -2184,6 +2232,7 @@
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
+ ".p2align 4 \n"
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@@ -2241,6 +2290,7 @@
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
+ ".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm3 \n"
"movdqa %%xmm3,%%xmm0 \n"
@@ -2313,6 +2363,7 @@
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
+ ".p2align 4 \n"
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@@ -2361,6 +2412,7 @@
"psrld $0x8,%%xmm5 \n"
// 4 pixel loop
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@@ -2415,6 +2467,7 @@
"movdqa %4,%%xmm5 \n"
// 4 pixel loop
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
@@ -2503,6 +2556,7 @@
"pslld $0x18,%%xmm4 \n"
// 4 pixel loop
+ ".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movzb 0x3(%0),%3 \n"