Fixed non-fatal out-of-bounds read in SSE2 SIMD code reported by valgrind when decompressing a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@727 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/ChangeLog.txt b/ChangeLog.txt
index ddd7bd5..be091dd 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -5,6 +5,12 @@
 was not adding the current directory to the assembler include path, so YASM
 was not able to find jsimdcfg.inc.)
 
+[2] Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
+a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
+This was more of an annoyance than an actual bug, since it did not cause any
+actual run-time problems, but the issue showed up when running libjpeg-turbo in
+valgrind.  See http://crbug.com/72399 for more information.
+
 
 1.1.90 (1.2 beta1)
 ==================
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
index fdb33a3..696a383 100644
--- a/simd/jdclrss2-64.asm
+++ b/simd/jdclrss2-64.asm
@@ -290,6 +290,41 @@
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store the lower 8 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	rcx, byte SIZEOF_MMWORD
+	jb	short .column_st7
+	movq	MMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_MMWORD
+	sub	rcx, byte SIZEOF_MMWORD
+	psrldq	xmmA, SIZEOF_MMWORD
+.column_st7:
+	; Store the lower 4 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	rcx, byte SIZEOF_DWORD
+	jb	short .column_st3
+	movd	DWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_DWORD
+	sub	rcx, byte SIZEOF_DWORD
+	psrldq	xmmA, SIZEOF_DWORD
+.column_st3:
+	; Store the lower 2 bytes of rax to the output when it has enough
+	; space.
+	movd	eax, xmmA
+	cmp	rcx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [rdi], ax
+	add	rdi, byte SIZEOF_WORD
+	sub	rcx, byte SIZEOF_WORD
+	shr	rax, 16
+.column_st1:
+	; Store the lower 1 byte of rax to the output when it has enough
+	; space.
+	test	rcx, rcx
+	jz	short .nextrow
+	mov	BYTE [rdi], al
+%else
 	mov	rax,rcx
 	xor	rcx, byte 0x0F
 	shl	rcx, 2
@@ -329,6 +364,7 @@
 	por	xmmE,xmmC
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -413,6 +449,22 @@
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD/4
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store two pixels (8 bytes) of xmmA to the output when it has enough
+	; space.
+	cmp	rcx, byte SIZEOF_XMMWORD/8
+	jb	short .column_st7
+	movq	MMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD/8*4
+	sub	rcx, byte SIZEOF_XMMWORD/8
+	psrldq	xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+	; Store one pixel (4 bytes) of xmmA to the output when it has enough
+	; space.
+	test	rcx, rcx
+	jz	short .nextrow
+	movd	DWORD [rdi], xmmA
+%else
 	cmp	rcx, byte SIZEOF_XMMWORD/16
 	jb	near .nextrow
 	mov	rax,rcx
@@ -452,6 +504,7 @@
 	por	xmmE,xmmG
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
index 3059d7d..7f519e6 100644
--- a/simd/jdclrss2.asm
+++ b/simd/jdclrss2.asm
@@ -302,6 +302,41 @@
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store the lower 8 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st7
+	movq	MMWORD [edi], xmmA
+	add	edi, byte SIZEOF_MMWORD
+	sub	ecx, byte SIZEOF_MMWORD
+	psrldq	xmmA, SIZEOF_MMWORD
+.column_st7:
+	; Store the lower 4 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st3
+	movd	DWORD [edi], xmmA
+	add	edi, byte SIZEOF_DWORD
+	sub	ecx, byte SIZEOF_DWORD
+	psrldq	xmmA, SIZEOF_DWORD
+.column_st3:
+	; Store the lower 2 bytes of eax to the output when it has enough
+	; space.
+	movd	eax, xmmA
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi], ax
+	add	edi, byte SIZEOF_WORD
+	sub	ecx, byte SIZEOF_WORD
+	shr	eax, 16
+.column_st1:
+	; Store the lower 1 byte of eax to the output when it has enough
+	; space.
+	test	ecx, ecx
+	jz	short .nextrow
+	mov	BYTE [edi], al
+%else
 	mov	eax,ecx
 	xor	ecx, byte 0x0F
 	shl	ecx, 2
@@ -341,6 +376,7 @@
 	por	xmmE,xmmC
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -426,6 +462,22 @@
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD/4
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store two pixels (8 bytes) of xmmA to the output when it has enough
+	; space.
+	cmp	ecx, byte SIZEOF_XMMWORD/8
+	jb	short .column_st7
+	movq	MMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD/8*4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	psrldq	xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+	; Store one pixel (4 bytes) of xmmA to the output when it has enough
+	; space.
+	test	ecx, ecx
+	jz	short .nextrow
+	movd	DWORD [edi], xmmA
+%else
 	cmp	ecx, byte SIZEOF_XMMWORD/16
 	jb	short .nextrow
 	mov	eax,ecx
@@ -465,6 +517,7 @@
 	por	xmmE,xmmG
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
index 0c2503f..a64a6b3 100644
--- a/simd/jdmrgss2-64.asm
+++ b/simd/jdmrgss2-64.asm
@@ -294,6 +294,41 @@
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store the lower 8 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	rcx, byte SIZEOF_MMWORD
+	jb	short .column_st7
+	movq	MMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_MMWORD
+	sub	rcx, byte SIZEOF_MMWORD
+	psrldq	xmmA, SIZEOF_MMWORD
+.column_st7:
+	; Store the lower 4 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	rcx, byte SIZEOF_DWORD
+	jb	short .column_st3
+	movd	DWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_DWORD
+	sub	rcx, byte SIZEOF_DWORD
+	psrldq	xmmA, SIZEOF_DWORD
+.column_st3:
+	; Store the lower 2 bytes of rax to the output when it has enough
+	; space.
+	movd	eax, xmmA
+	cmp	rcx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [rdi], ax
+	add	rdi, byte SIZEOF_WORD
+	sub	rcx, byte SIZEOF_WORD
+	shr	rax, 16
+.column_st1:
+	; Store the lower 1 byte of rax to the output when it has enough
+	; space.
+	test	rcx, rcx
+	jz	short .endcolumn
+	mov	BYTE [rdi], al
+%else
 	mov	rax,rcx
 	xor	rcx, byte 0x0F
 	shl	rcx, 2
@@ -333,6 +368,7 @@
 	por	xmmE,xmmC
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -420,6 +456,22 @@
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD/4
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store two pixels (8 bytes) of xmmA to the output when it has enough
+	; space.
+	cmp	rcx, byte SIZEOF_XMMWORD/8
+	jb	short .column_st7
+	movq	MMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD/8*4
+	sub	rcx, byte SIZEOF_XMMWORD/8
+	psrldq	xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+	; Store one pixel (4 bytes) of xmmA to the output when it has enough
+	; space.
+	test	rcx, rcx
+	jz	short .endcolumn
+	movd	DWORD [rdi], xmmA
+%else
 	cmp	rcx, byte SIZEOF_XMMWORD/16
 	jb	near .endcolumn
 	mov	rax,rcx
@@ -459,6 +511,7 @@
 	por	xmmE,xmmG
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
index 368ac3c..04089aa 100644
--- a/simd/jdmrgss2.asm
+++ b/simd/jdmrgss2.asm
@@ -307,6 +307,41 @@
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store the lower 8 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st7
+	movq	MMWORD [edi], xmmA
+	add	edi, byte SIZEOF_MMWORD
+	sub	ecx, byte SIZEOF_MMWORD
+	psrldq	xmmA, SIZEOF_MMWORD
+.column_st7:
+	; Store the lower 4 bytes of xmmA to the output when it has enough
+	; space.
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st3
+	movd	DWORD [edi], xmmA
+	add	edi, byte SIZEOF_DWORD
+	sub	ecx, byte SIZEOF_DWORD
+	psrldq	xmmA, SIZEOF_DWORD
+.column_st3:
+	; Store the lower 2 bytes of eax to the output when it has enough
+	; space.
+	movd	eax, xmmA
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi], ax
+	add	edi, byte SIZEOF_WORD
+	sub	ecx, byte SIZEOF_WORD
+	shr	eax, 16
+.column_st1:
+	; Store the lower 1 byte of eax to the output when it has enough
+	; space.
+	test	ecx, ecx
+	jz	short .endcolumn
+	mov	BYTE [edi], al
+%else
 	mov	eax,ecx
 	xor	ecx, byte 0x0F
 	shl	ecx, 2
@@ -346,6 +381,7 @@
 	por	xmmE,xmmC
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -434,6 +470,22 @@
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD/4
 .column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+	; Store two pixels (8 bytes) of xmmA to the output when it has enough
+	; space.  (ecx is compared in units of pixels here, 4 bytes each.)
+	cmp	ecx, byte SIZEOF_XMMWORD/8
+	jb	short .column_st7
+	movq	MMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD/8*4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	psrldq	xmmA, SIZEOF_XMMWORD/8*4	; byte shift (counts >15 zero the reg)
+.column_st7:
+	; Store one pixel (4 bytes) of xmmA to the output when it has enough
+	; space.
+	test	ecx, ecx
+	jz	short .endcolumn
+	movd	DWORD [edi], xmmA
+%else
 	cmp	ecx, byte SIZEOF_XMMWORD/16
 	jb	short .endcolumn
 	mov	eax,ecx
@@ -473,6 +525,7 @@
 	por	xmmE,xmmG
 .adj0:	; ----------------
 	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
index 635a931..4ab9bc0 100644
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc
@@ -86,6 +86,8 @@
 %define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
 %endif
 
+%define STRICT_MEMORY_ACCESS 1
+
 ; To make the code position-independent, append -DPIC to the commandline
 ;
 %define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC