Implement RGB565 multisample resolve.
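
Resolve averages the samples of each pixel in place into sample 0. The
SSE2 path processes 8 RGB565 pixels per 128-bit register: red and blue
(mask 0xF81F) each sit within one byte and are averaged with
_mm_avg_epu8, while green (mask 0x07E0) straddles the byte boundary and
is averaged with _mm_avg_epu16. The scalar fallback averages all three
fields of a packed pixel at once with a carry-safe bit trick. Samples
are combined in a balanced pairwise tree so each one gets equal weight.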

Bug 20891368

Change-Id: I29054ccc0a91fdc41d26d26dd4f55dfd4dfca7e4
Reviewed-on: https://swiftshader-review.googlesource.com/3952
Tested-by: Nicolas Capens <capn@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Renderer/Surface.cpp b/src/Renderer/Surface.cpp
index 0919066..79e4c0b 100644
--- a/src/Renderer/Surface.cpp
+++ b/src/Renderer/Surface.cpp
@@ -5520,6 +5520,420 @@
 				else ASSERT(false);
 			}
 		}
+		else if(internal.format == FORMAT_R5G6B5)
+		{
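+			// Vector path: 8 RGB565 pixels (16 bytes) per iteration, assuming
+			// 16-byte aligned rows for _mm_load_si128/_mm_store_si128.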
+			if(CPUID::supportsSSE2() && (width % 8) == 0)
+			{
+				if(internal.depth == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x += 8)
+						{
+							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+
+							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
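+							// Split each pixel into red|blue (0xF81F) and green (0x07E0) so the
+							// fields can be averaged without carries crossing into a neighbor.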
+							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+
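+							// Red and blue each sit within one byte, so an 8-bit average works;
+							// green straddles the byte boundary and needs a 16-bit average. The
+							// re-mask drops rounding bits that land just below each field.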
+							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+							c1 = _mm_avg_epu16(c0__g_, c1__g_);
+							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+							c0 = _mm_or_si128(c0, c1);
+
+							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.depth == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x += 8)
+						{
+							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+
+							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+
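+							// Pairwise tree reduction gives every sample equal weight.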
+							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+							c0 = _mm_avg_epu8(c0, c2);
+							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+							c1 = _mm_avg_epu16(c0__g_, c1__g_);
+							c3 = _mm_avg_epu16(c2__g_, c3__g_);
+							c1 = _mm_avg_epu16(c1, c3);
+							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+							c0 = _mm_or_si128(c0, c1);
+
+							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.depth == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x += 8)
+						{
+							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+
+							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+
+							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+							c0 = _mm_avg_epu8(c0, c2);
+							c4 = _mm_avg_epu8(c4, c6);
+							c0 = _mm_avg_epu8(c0, c4);
+							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+							c1 = _mm_avg_epu16(c0__g_, c1__g_);
+							c3 = _mm_avg_epu16(c2__g_, c3__g_);
+							c5 = _mm_avg_epu16(c4__g_, c5__g_);
+							c7 = _mm_avg_epu16(c6__g_, c7__g_);
+							c1 = _mm_avg_epu16(c1, c3);
+							c5 = _mm_avg_epu16(c5, c7);
+							c1 = _mm_avg_epu16(c1, c5);
+							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+							c0 = _mm_or_si128(c0, c1);
+
+							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.depth == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x += 8)
+						{
+							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
+							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
+							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
+							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
+							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
+							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
+							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
+							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
+
+							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
+							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
+							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
+							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
+							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
+							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
+							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
+							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
+							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
+							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
+							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
+							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
+							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
+							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
+							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
+
+							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
+							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
+							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
+							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
+							c0 = _mm_avg_epu8(c0, c2);
+							c4 = _mm_avg_epu8(c4, c6);
+							c8 = _mm_avg_epu8(c8, cA);
+							cC = _mm_avg_epu8(cC, cE);
+							c0 = _mm_avg_epu8(c0, c4);
+							c8 = _mm_avg_epu8(c8, cC);
+							c0 = _mm_avg_epu8(c0, c8);
+							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
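+							// Green stays 16-bit: the field straddles the byte boundary.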
+							c1 = _mm_avg_epu16(c0__g_, c1__g_);
+							c3 = _mm_avg_epu16(c2__g_, c3__g_);
+							c5 = _mm_avg_epu16(c4__g_, c5__g_);
+							c7 = _mm_avg_epu16(c6__g_, c7__g_);
+							c9 = _mm_avg_epu16(c8__g_, c9__g_);
+							cB = _mm_avg_epu16(cA__g_, cB__g_);
+							cD = _mm_avg_epu16(cC__g_, cD__g_);
+							cF = _mm_avg_epu16(cE__g_, cF__g_);
+							c1 = _mm_avg_epu16(c1, c3);
+							c5 = _mm_avg_epu16(c5, c7);
+							c9 = _mm_avg_epu16(c9, cB);
+							cD = _mm_avg_epu16(cD, cF);
+							c1 = _mm_avg_epu16(c1, c5);
+							c9 = _mm_avg_epu16(c9, cD);
+							c1 = _mm_avg_epu16(c1, c9);
+							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+							c0 = _mm_or_si128(c0, c1);
+
+							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+			}
+			else
+			{
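+				// Rounded average of two RGB565 pixels without unpacking:
+				// (x & y) + ((x ^ y) >> 1) is the truncated per-field average once
+				// 0x7BEF clears bits shifted across field boundaries; adding back
+				// (x ^ y) & 0x0821 (each field's dropped LSB) rounds up like avg_epu16.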
+				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
+
+				if(internal.depth == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.depth == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c0 = AVERAGE(c0, c2);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.depth == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
+							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
+							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
+							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c0 = AVERAGE(c0, c4);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.depth == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
+							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
+							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
+							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
+							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
+							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
+							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
+							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
+							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
+							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
+							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
+							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c8 = AVERAGE(c8, c9);
+							cA = AVERAGE(cA, cB);
+							cC = AVERAGE(cC, cD);
+							cE = AVERAGE(cE, cF);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c8 = AVERAGE(c8, cA);
+							cC = AVERAGE(cC, cE);
+							c0 = AVERAGE(c0, c4);
+							c8 = AVERAGE(c8, cC);
+							c0 = AVERAGE(c0, c8);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+
+				#undef AVERAGE
+			}
+		}
 		else
 		{
 		//	UNIMPLEMENTED();