| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| %include "vpx_ports/x86_abi_support.asm" |
| |
| ; LOAD_16X1X8 acc, src_addr, ref_addr |
| ; |
| ; Loads one row pair for the 16-wide kernel: |
| ; xmm0 = 16 bytes at src_addr (movdqa, so src_addr must be 16-byte aligned) |
| ; acc = bytes 0-15 at ref_addr |
| ; xmm3 = bytes 8-23 at ref_addr |
| ; Clobbers xmm2. |
| %macro LOAD_16X1X8 3 |
| movdqa xmm0, XMMWORD PTR [%2] |
| movq %1, MMWORD PTR [%3] |
| movq xmm3, MMWORD PTR [%3+8] |
| movq xmm2, MMWORD PTR [%3+16] |
| punpcklqdq %1, xmm3 |
| punpcklqdq xmm3, xmm2 |
| %endmacro |
| |
| ; SAD_16X1X8 acc |
| ; |
| ; Consumes the registers set up by LOAD_16X1X8 and leaves in acc eight |
| ; 16-bit sums: lane i is the SAD of the 16 bytes in xmm0 against the 16 |
| ; reference bytes starting at offset i (i = 0..7). |
| ; Clobbers xmm0, xmm2, xmm3, xmm4. |
| %macro SAD_16X1X8 1 |
| movdqa xmm2, %1 |
| ; imm 0x0: bytes 0-3 of xmm0 against windows starting at 0..7 |
| mpsadbw %1, xmm0, 0x0 |
| ; imm 0x5: bytes 4-7 of xmm0 against windows starting at 4..11 |
| mpsadbw xmm2, xmm0, 0x5 |
| |
| ; bring bytes 8-15 of the row down to the low half of xmm0 |
| psrldq xmm0, 8 |
| |
| movdqa xmm4, xmm3 |
| mpsadbw xmm3, xmm0, 0x0 |
| mpsadbw xmm4, xmm0, 0x5 |
| |
| ; add the four partial results together |
| paddw %1, xmm2 |
| paddw %1, xmm3 |
| paddw %1, xmm4 |
| %endmacro |
| |
| ; PROCESS_16X2X8 first |
| ; |
| ; Folds two 16-byte rows into the eight running 16-bit SADs held in xmm1. |
| ; Lane i of xmm1 covers the row at rsi against the 16 bytes starting at |
| ; rdi+i, for i = 0..7. |
| ; |
| ; %1 != 0 : first row pair, xmm1 is overwritten rather than accumulated. |
| ; %1 == 0 : later row pair, the new sums are added onto xmm1. |
| ; |
| ; In: rsi, rax = row pointer and stride of the block read with movdqa |
| ; (every row address has to be 16-byte aligned) |
| ; rdi, rdx = row pointer and stride of the block that is searched; |
| ; 24 bytes are read from each of its rows |
| ; Out: xmm1 updated, rsi and rdi advanced by two rows |
| ; Clobbers: xmm0, xmm2, xmm3, xmm4, xmm5 |
| %macro PROCESS_16X2X8 1 |
| %if %1 |
| ; first row of the block: results land directly in xmm1 |
| LOAD_16X1X8 xmm1, rsi, rdi |
| SAD_16X1X8 xmm1 |
| %else |
| LOAD_16X1X8 xmm5, rsi, rdi |
| SAD_16X1X8 xmm5 |
| |
| paddw xmm1, xmm5 |
| %endif |
| ; second row of the pair always accumulates through xmm5 |
| LOAD_16X1X8 xmm5, rsi+rax, rdi+rdx |
| |
| ; step both pointers two rows down while the loads are in flight |
| lea rsi, [rsi+rax*2] |
| lea rdi, [rdi+rdx*2] |
| |
| SAD_16X1X8 xmm5 |
| |
| paddw xmm1, xmm5 |
| %endmacro |
| |
| ; LOAD_8X1X8 acc, src_addr, ref_addr |
| ; |
| ; Loads one row pair for the 8-wide kernel: |
| ; xmm0 = 8 bytes at src_addr (upper half zeroed by movq) |
| ; acc = bytes 0-15 at ref_addr |
| ; Clobbers xmm3. |
| %macro LOAD_8X1X8 3 |
| movq xmm0, MMWORD PTR [%2] |
| movq %1, MMWORD PTR [%3] |
| movq xmm3, MMWORD PTR [%3+8] |
| punpcklqdq %1, xmm3 |
| %endmacro |
| |
| ; SAD_8X1X8 acc |
| ; |
| ; Leaves in acc eight 16-bit sums: lane i is the SAD of the 8 bytes in |
| ; xmm0 against the 8 reference bytes starting at offset i (i = 0..7). |
| ; Clobbers xmm2. |
| %macro SAD_8X1X8 1 |
| movdqa xmm2, %1 |
| ; imm 0x0: bytes 0-3 of xmm0 against windows starting at 0..7 |
| mpsadbw %1, xmm0, 0x0 |
| ; imm 0x5: bytes 4-7 of xmm0 against windows starting at 4..11 |
| mpsadbw xmm2, xmm0, 0x5 |
| paddw %1, xmm2 |
| %endmacro |
| |
| ; PROCESS_8X2X8 first |
| ; |
| ; Folds two 8-byte rows into the eight running 16-bit SADs held in xmm1. |
| ; Lane i of xmm1 covers the row at rsi against the 8 bytes starting at |
| ; rdi+i, for i = 0..7. |
| ; |
| ; %1 != 0 : first row pair, xmm1 is overwritten rather than accumulated. |
| ; %1 == 0 : later row pair, the new sums are added onto xmm1. |
| ; |
| ; In: rsi, rax = row pointer and stride of the 8-byte-wide block |
| ; rdi, rdx = row pointer and stride of the block that is searched; |
| ; 16 bytes are read from each of its rows |
| ; Out: xmm1 updated, rsi and rdi advanced by two rows |
| ; Clobbers: xmm0, xmm2, xmm3, xmm5 |
| %macro PROCESS_8X2X8 1 |
| %if %1 |
| ; first row of the block: results land directly in xmm1 |
| LOAD_8X1X8 xmm1, rsi, rdi |
| |
| SAD_8X1X8 xmm1 |
| %else |
| LOAD_8X1X8 xmm5, rsi, rdi |
| |
| SAD_8X1X8 xmm5 |
| |
| paddw xmm1, xmm5 |
| %endif |
| ; second row of the pair always accumulates through xmm5 |
| LOAD_8X1X8 xmm5, rsi+rax, rdi+rdx |
| |
| ; step both pointers two rows down |
| lea rsi, [rsi+rax*2] |
| lea rdi, [rdi+rdx*2] |
| |
| SAD_8X1X8 xmm5 |
| |
| paddw xmm1, xmm5 |
| %endmacro |
| |
| ; LOAD_4X1X8 acc, src_addr, ref_addr |
| ; |
| ; Loads one row pair for the 4-wide kernel: |
| ; xmm0 = 4 bytes at src_addr (remaining bytes zeroed by movd) |
| ; acc = bytes 0-15 at ref_addr |
| ; Clobbers xmm3. |
| %macro LOAD_4X1X8 3 |
| movd xmm0, [%2] |
| movq %1, MMWORD PTR [%3] |
| movq xmm3, MMWORD PTR [%3+8] |
| punpcklqdq %1, xmm3 |
| %endmacro |
| |
| ; PROCESS_4X2X8 first |
| ; |
| ; Folds two 4-byte rows into the eight running 16-bit SADs held in xmm1. |
| ; Lane i of xmm1 covers the row at rsi against the 4 bytes starting at |
| ; rdi+i, for i = 0..7. A single mpsadbw with imm 0x0 handles a whole row. |
| ; |
| ; %1 != 0 : first row pair, xmm1 is overwritten rather than accumulated. |
| ; %1 == 0 : later row pair, the new sums are added onto xmm1. |
| ; |
| ; In: rsi, rax = row pointer and stride of the 4-byte-wide block |
| ; rdi, rdx = row pointer and stride of the block that is searched; |
| ; 16 bytes are read from each of its rows |
| ; Out: xmm1 updated, rsi and rdi advanced by two rows |
| ; Clobbers: xmm0, xmm3, xmm5 |
| %macro PROCESS_4X2X8 1 |
| %if %1 |
| ; first row of the block: results land directly in xmm1 |
| LOAD_4X1X8 xmm1, rsi, rdi |
| |
| mpsadbw xmm1, xmm0, 0x0 |
| %else |
| LOAD_4X1X8 xmm5, rsi, rdi |
| |
| mpsadbw xmm5, xmm0, 0x0 |
| |
| paddw xmm1, xmm5 |
| %endif |
| ; second row of the pair always accumulates through xmm5 |
| LOAD_4X1X8 xmm5, rsi+rax, rdi+rdx |
| |
| ; step both pointers two rows down |
| lea rsi, [rsi+rax*2] |
| lea rdi, [rdi+rdx*2] |
| |
| mpsadbw xmm5, xmm0, 0x0 |
| |
| paddw xmm1, xmm5 |
| %endmacro |
| |
| |
| ;void vp8_sad16x16x8_sse4( |
| ; const unsigned char *src_ptr, |
| ; int src_stride, |
| ; const unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; unsigned short *sad_array); |
| ; |
| ; Stores eight 16-bit SADs in sad_array: entry i compares the 16x16 block |
| ; at src_ptr with the 16x16 block at ref_ptr + i, for i = 0..7. |
| ; |
| ; Requirements visible in the code: |
| ; - sad_array is written with movdqa, so it must be 16-byte aligned. |
| ; - PROCESS_16X2X8 reads every source row with movdqa, so each source |
| ; row address must be 16-byte aligned. |
| ; - 24 bytes are read from every reference row. |
| ; Uses xmm0-xmm5, rax and rdx; rsi and rdi are saved and restored. |
| global sym(vp8_sad16x16x8_sse4) |
| sym(vp8_sad16x16x8_sse4): |
| push rbp |
| mov rbp, rsp |
| ; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it |
| ; makes the five arguments reachable through arg(n) - confirm there. |
| SHADOW_ARGS_TO_STACK 5 |
| ; rsi/rdi serve as row pointers below, keep the caller's values |
| push rsi |
| push rdi |
| ; prologue done |
| |
| mov rsi, arg(0) ; rsi = src_ptr |
| mov rdi, arg(2) ; rdi = ref_ptr |
| |
| movsxd rax, dword ptr arg(1) ; rax = src_stride |
| movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
| |
| ; rows 0-1 initialise the accumulator xmm1 |
| PROCESS_16X2X8 1 |
| ; rows 2-15: seven more row pairs are added onto xmm1 |
| %rep 7 |
| PROCESS_16X2X8 0 |
| %endrep |
| |
| mov rdi, arg(4) ; rdi = sad_array |
| movdqa XMMWORD PTR [rdi], xmm1 |
| |
| ; epilogue |
| pop rdi |
| pop rsi |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| |
| ;void vp8_sad16x8x8_sse4( |
| ; const unsigned char *src_ptr, |
| ; int src_stride, |
| ; const unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; unsigned short *sad_array |
| ;); |
| ; |
| ; Stores eight 16-bit SADs in sad_array: entry i compares the block of |
| ; 8 rows by 16 bytes at src_ptr with the same-sized block at ref_ptr + i, |
| ; for i = 0..7. |
| ; |
| ; Requirements visible in the code: |
| ; - sad_array is written with movdqa, so it must be 16-byte aligned. |
| ; - PROCESS_16X2X8 reads every source row with movdqa, so each source |
| ; row address must be 16-byte aligned. |
| ; - 24 bytes are read from every reference row. |
| ; Uses xmm0-xmm5, rax and rdx; rsi and rdi are saved and restored. |
| global sym(vp8_sad16x8x8_sse4) |
| sym(vp8_sad16x8x8_sse4): |
| push rbp |
| mov rbp, rsp |
| ; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it |
| ; makes the five arguments reachable through arg(n) - confirm there. |
| SHADOW_ARGS_TO_STACK 5 |
| ; rsi/rdi serve as row pointers below, keep the caller's values |
| push rsi |
| push rdi |
| ; prologue done |
| |
| mov rsi, arg(0) ; rsi = src_ptr |
| mov rdi, arg(2) ; rdi = ref_ptr |
| |
| movsxd rax, dword ptr arg(1) ; rax = src_stride |
| movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
| |
| ; rows 0-1 initialise the accumulator xmm1 |
| PROCESS_16X2X8 1 |
| ; rows 2-7: three more row pairs are added onto xmm1 |
| %rep 3 |
| PROCESS_16X2X8 0 |
| %endrep |
| |
| mov rdi, arg(4) ; rdi = sad_array |
| movdqa XMMWORD PTR [rdi], xmm1 |
| |
| ; epilogue |
| pop rdi |
| pop rsi |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| |
| ;void vp8_sad8x8x8_sse4( |
| ; const unsigned char *src_ptr, |
| ; int src_stride, |
| ; const unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; unsigned short *sad_array |
| ;); |
| ; |
| ; Stores eight 16-bit SADs in sad_array: entry i compares the 8x8 block |
| ; at src_ptr with the 8x8 block at ref_ptr + i, for i = 0..7. |
| ; |
| ; Requirements visible in the code: |
| ; - sad_array is written with movdqa, so it must be 16-byte aligned. |
| ; - 16 bytes are read from every reference row. |
| ; Uses xmm0-xmm3, xmm5, rax and rdx; rsi and rdi are saved and restored. |
| global sym(vp8_sad8x8x8_sse4) |
| sym(vp8_sad8x8x8_sse4): |
| push rbp |
| mov rbp, rsp |
| ; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it |
| ; makes the five arguments reachable through arg(n) - confirm there. |
| SHADOW_ARGS_TO_STACK 5 |
| ; rsi/rdi serve as row pointers below, keep the caller's values |
| push rsi |
| push rdi |
| ; prologue done |
| |
| mov rsi, arg(0) ; rsi = src_ptr |
| mov rdi, arg(2) ; rdi = ref_ptr |
| |
| movsxd rax, dword ptr arg(1) ; rax = src_stride |
| movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
| |
| ; rows 0-1 initialise the accumulator xmm1 |
| PROCESS_8X2X8 1 |
| ; rows 2-7: three more row pairs are added onto xmm1 |
| %rep 3 |
| PROCESS_8X2X8 0 |
| %endrep |
| |
| mov rdi, arg(4) ; rdi = sad_array |
| movdqa XMMWORD PTR [rdi], xmm1 |
| |
| ; epilogue |
| pop rdi |
| pop rsi |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| |
| ;void vp8_sad8x16x8_sse4( |
| ; const unsigned char *src_ptr, |
| ; int src_stride, |
| ; const unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; unsigned short *sad_array |
| ;); |
| ; |
| ; Stores eight 16-bit SADs in sad_array: entry i compares the block of |
| ; 16 rows by 8 bytes at src_ptr with the same-sized block at ref_ptr + i, |
| ; for i = 0..7. |
| ; |
| ; Requirements visible in the code: |
| ; - sad_array is written with movdqa, so it must be 16-byte aligned. |
| ; - 16 bytes are read from every reference row. |
| ; Uses xmm0-xmm3, xmm5, rax and rdx; rsi and rdi are saved and restored. |
| global sym(vp8_sad8x16x8_sse4) |
| sym(vp8_sad8x16x8_sse4): |
| push rbp |
| mov rbp, rsp |
| ; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it |
| ; makes the five arguments reachable through arg(n) - confirm there. |
| SHADOW_ARGS_TO_STACK 5 |
| ; rsi/rdi serve as row pointers below, keep the caller's values |
| push rsi |
| push rdi |
| ; prologue done |
| |
| mov rsi, arg(0) ; rsi = src_ptr |
| mov rdi, arg(2) ; rdi = ref_ptr |
| |
| movsxd rax, dword ptr arg(1) ; rax = src_stride |
| movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
| |
| ; rows 0-1 initialise the accumulator xmm1 |
| PROCESS_8X2X8 1 |
| ; rows 2-15: seven more row pairs are added onto xmm1 |
| %rep 7 |
| PROCESS_8X2X8 0 |
| %endrep |
| |
| mov rdi, arg(4) ; rdi = sad_array |
| movdqa XMMWORD PTR [rdi], xmm1 |
| |
| ; epilogue |
| pop rdi |
| pop rsi |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| |
| ;void vp8_sad4x4x8_sse4( |
| ; const unsigned char *src_ptr, |
| ; int src_stride, |
| ; const unsigned char *ref_ptr, |
| ; int ref_stride, |
| ; unsigned short *sad_array |
| ;); |
| ; |
| ; Stores eight 16-bit SADs in sad_array: entry i compares the 4x4 block |
| ; at src_ptr with the 4x4 block at ref_ptr + i, for i = 0..7. |
| ; |
| ; Requirements visible in the code: |
| ; - sad_array is written with movdqa, so it must be 16-byte aligned. |
| ; - 16 bytes are read from every reference row. |
| ; Uses xmm0, xmm1, xmm3, xmm5, rax and rdx; rsi and rdi are saved and |
| ; restored. |
| global sym(vp8_sad4x4x8_sse4) |
| sym(vp8_sad4x4x8_sse4): |
| push rbp |
| mov rbp, rsp |
| ; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it |
| ; makes the five arguments reachable through arg(n) - confirm there. |
| SHADOW_ARGS_TO_STACK 5 |
| ; rsi/rdi serve as row pointers below, keep the caller's values |
| push rsi |
| push rdi |
| ; end prolog |
| |
| mov rsi, arg(0) ; rsi = src_ptr |
| mov rdi, arg(2) ; rdi = ref_ptr |
| |
| movsxd rax, dword ptr arg(1) ; rax = src_stride |
| movsxd rdx, dword ptr arg(3) ; rdx = ref_stride |
| |
| ; rows 0-1 initialise the accumulator xmm1 |
| PROCESS_4X2X8 1 |
| ; rows 2-3 are added onto xmm1 |
| PROCESS_4X2X8 0 |
| |
| mov rdi, arg(4) ; rdi = sad_array |
| movdqa XMMWORD PTR [rdi], xmm1 |
| |
| ; begin epilog |
| pop rdi |
| pop rsi |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| |
| |
| |