Fix ARM compilation.

This does not provide full support for ARM, but merely makes things (statically) compile.

Bug b/37478805

Change-Id: I01d1d84e396c04c84e74d521946595014d2eafb5
Reviewed-on: https://swiftshader-review.googlesource.com/9430
Reviewed-by: Nicolas Capens <capn@google.com>
Tested-by: Nicolas Capens <capn@google.com>

commit: 47dc8676fcff01274de3d3fe98e1de607ff076d3 [log] [tgz]
author: Nicolas Capens <capn@google.com> Tue Apr 25 12:54:39 2017 -0400
committer: Nicolas Capens <capn@google.com> Tue Apr 25 19:31:28 2017 +0000
tree: 10bbff3e03d01439f2b2b9bdb9f52178e1e696bc
parent: 1cc44388266a519499c582a7d78cb5c6cfbcd5bf [diff]
diff --git a/src/Renderer/Surface.cpp b/src/Renderer/Surface.cpp
index 7d75217..4eb77ca 100644
--- a/src/Renderer/Surface.cpp
+++ b/src/Renderer/Surface.cpp

@@ -25,8 +25,10 @@
 #include "Common/Debug.hpp"
 #include "Reactor/Reactor.hpp"
 
-#include <xmmintrin.h>
-#include <emmintrin.h>
+#if defined(__i386__) || defined(__x86_64__)
+	#include <xmmintrin.h>
+	#include <emmintrin.h>
+#endif
 
 #undef min
 #undef max
@@ -3104,33 +3106,35 @@
 			bytes -= 2;
 		}
 
-		if(CPUID::supportsSSE())
-		{
-			while((size_t)buffer & 0xF && bytes >= 4)
+		#if defined(__i386__) || defined(__x86_64__)
+			if(CPUID::supportsSSE())
 			{
-				*(int*)buffer = pattern;
-				(int*&)buffer += 1;
-				bytes -= 4;
+				while((size_t)buffer & 0xF && bytes >= 4)
+				{
+					*(int*)buffer = pattern;
+					(int*&)buffer += 1;
+					bytes -= 4;
+				}
+
+				__m128 quad = _mm_set_ps1((float&)pattern);
+
+				float *pointer = (float*)buffer;
+				int qxwords = bytes / 64;
+				bytes -= qxwords * 64;
+
+				while(qxwords--)
+				{
+					_mm_stream_ps(pointer + 0, quad);
+					_mm_stream_ps(pointer + 4, quad);
+					_mm_stream_ps(pointer + 8, quad);
+					_mm_stream_ps(pointer + 12, quad);
+
+					pointer += 16;
+				}
+
+				buffer = pointer;
 			}
-
-			__m128 quad = _mm_set_ps1((float&)pattern);
-
-			float *pointer = (float*)buffer;
-			int qxwords = bytes / 64;
-			bytes -= qxwords * 64;
-
-			while(qxwords--)
-			{
-				_mm_stream_ps(pointer + 0, quad);
-				_mm_stream_ps(pointer + 4, quad);
-				_mm_stream_ps(pointer + 8, quad);
-				_mm_stream_ps(pointer + 12, quad);
-
-				pointer += 16;
-			}
-
-			buffer = pointer;
-		}
+		#endif
 
 		while(bytes >= 4)
 		{
@@ -3771,149 +3775,151 @@
 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
 		{
-			if(CPUID::supportsSSE2() && (width % 4) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 4) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
+								c0 = _mm_avg_epu8(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
-							c2 = _mm_avg_epu8(c2, c3);
-							c0 = _mm_avg_epu8(c0, c2);
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c0 = _mm_avg_epu8(c0, c2);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
-							c2 = _mm_avg_epu8(c2, c3);
-							c4 = _mm_avg_epu8(c4, c5);
-							c6 = _mm_avg_epu8(c6, c7);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c0 = _mm_avg_epu8(c0, c4);
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c4 = _mm_avg_epu8(c4, c5);
+								c6 = _mm_avg_epu8(c6, c7);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c0 = _mm_avg_epu8(c0, c4);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
 
-							c0 = _mm_avg_epu8(c0, c1);
-							c2 = _mm_avg_epu8(c2, c3);
-							c4 = _mm_avg_epu8(c4, c5);
-							c6 = _mm_avg_epu8(c6, c7);
-							c8 = _mm_avg_epu8(c8, c9);
-							cA = _mm_avg_epu8(cA, cB);
-							cC = _mm_avg_epu8(cC, cD);
-							cE = _mm_avg_epu8(cE, cF);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c8 = _mm_avg_epu8(c8, cA);
-							cC = _mm_avg_epu8(cC, cE);
-							c0 = _mm_avg_epu8(c0, c4);
-							c8 = _mm_avg_epu8(c8, cC);
-							c0 = _mm_avg_epu8(c0, c8);
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c4 = _mm_avg_epu8(c4, c5);
+								c6 = _mm_avg_epu8(c6, c7);
+								c8 = _mm_avg_epu8(c8, c9);
+								cA = _mm_avg_epu8(cA, cB);
+								cC = _mm_avg_epu8(cC, cD);
+								cE = _mm_avg_epu8(cE, cF);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c8 = _mm_avg_epu8(c8, cA);
+								cC = _mm_avg_epu8(cC, cE);
+								c0 = _mm_avg_epu8(c0, c4);
+								c8 = _mm_avg_epu8(c8, cC);
+								c0 = _mm_avg_epu8(c0, c8);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
 
@@ -4062,149 +4068,152 @@
 		}
 		else if(internal.format == FORMAT_G16R16)
 		{
-			if(CPUID::supportsSSE2() && (width % 4) == 0)
-			{
-				if(internal.depth == 2)
+
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 4) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
+								c0 = _mm_avg_epu16(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c0 = _mm_avg_epu16(c0, c2);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c0 = _mm_avg_epu16(c0, c2);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c0 = _mm_avg_epu16(c0, c4);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c0 = _mm_avg_epu16(c0, c4);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c8 = _mm_avg_epu16(c8, c9);
-							cA = _mm_avg_epu16(cA, cB);
-							cC = _mm_avg_epu16(cC, cD);
-							cE = _mm_avg_epu16(cE, cF);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c8 = _mm_avg_epu16(c8, cA);
-							cC = _mm_avg_epu16(cC, cE);
-							c0 = _mm_avg_epu16(c0, c4);
-							c8 = _mm_avg_epu16(c8, cC);
-							c0 = _mm_avg_epu16(c0, c8);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c8 = _mm_avg_epu16(c8, c9);
+								cA = _mm_avg_epu16(cA, cB);
+								cC = _mm_avg_epu16(cC, cD);
+								cE = _mm_avg_epu16(cE, cF);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c8 = _mm_avg_epu16(c8, cA);
+								cC = _mm_avg_epu16(cC, cE);
+								c0 = _mm_avg_epu16(c0, c4);
+								c8 = _mm_avg_epu16(c8, cC);
+								c0 = _mm_avg_epu16(c0, c8);
 
-							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
 
@@ -4353,149 +4362,151 @@
 		}
 		else if(internal.format == FORMAT_A16B16G16R16)
 		{
-			if(CPUID::supportsSSE2() && (width % 2) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 2) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
+								c0 = _mm_avg_epu16(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c0 = _mm_avg_epu16(c0, c2);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c0 = _mm_avg_epu16(c0, c2);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c0 = _mm_avg_epu16(c0, c4);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c0 = _mm_avg_epu16(c0, c4);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
 
-							c0 = _mm_avg_epu16(c0, c1);
-							c2 = _mm_avg_epu16(c2, c3);
-							c4 = _mm_avg_epu16(c4, c5);
-							c6 = _mm_avg_epu16(c6, c7);
-							c8 = _mm_avg_epu16(c8, c9);
-							cA = _mm_avg_epu16(cA, cB);
-							cC = _mm_avg_epu16(cC, cD);
-							cE = _mm_avg_epu16(cE, cF);
-							c0 = _mm_avg_epu16(c0, c2);
-							c4 = _mm_avg_epu16(c4, c6);
-							c8 = _mm_avg_epu16(c8, cA);
-							cC = _mm_avg_epu16(cC, cE);
-							c0 = _mm_avg_epu16(c0, c4);
-							c8 = _mm_avg_epu16(c8, cC);
-							c0 = _mm_avg_epu16(c0, c8);
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c8 = _mm_avg_epu16(c8, c9);
+								cA = _mm_avg_epu16(cA, cB);
+								cC = _mm_avg_epu16(cC, cD);
+								cE = _mm_avg_epu16(cE, cF);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c8 = _mm_avg_epu16(c8, cA);
+								cC = _mm_avg_epu16(cC, cE);
+								c0 = _mm_avg_epu16(c0, c4);
+								c8 = _mm_avg_epu16(c8, cC);
+								c0 = _mm_avg_epu16(c0, c8);
 
-							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
 
@@ -4644,153 +4655,155 @@
 		}
 		else if(internal.format == FORMAT_R32F)
 		{
-			if(CPUID::supportsSSE() && (width % 4) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE() && (width % 4) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c0 = _mm_add_ps(c0, c2);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c0 = _mm_add_ps(c0, c4);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 4)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
-							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
-							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
-							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
-							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
-							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
-							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
-							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
-							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c8 = _mm_add_ps(c8, c9);
-							cA = _mm_add_ps(cA, cB);
-							cC = _mm_add_ps(cC, cD);
-							cE = _mm_add_ps(cE, cF);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c8 = _mm_add_ps(c8, cA);
-							cC = _mm_add_ps(cC, cE);
-							c0 = _mm_add_ps(c0, c4);
-							c8 = _mm_add_ps(c8, cC);
-							c0 = _mm_add_ps(c0, c8);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
 
-							_mm_store_ps((float*)(source0 + 4 * x), c0);
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				if(internal.depth == 2)
 				{
@@ -4939,153 +4952,155 @@
 		}
 		else if(internal.format == FORMAT_G32R32F)
 		{
-			if(CPUID::supportsSSE() && (width % 2) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE() && (width % 2) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c0 = _mm_add_ps(c0, c2);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c0 = _mm_add_ps(c0, c4);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 2)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
-							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
-							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
-							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
-							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
-							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
-							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
-							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
-							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c8 = _mm_add_ps(c8, c9);
-							cA = _mm_add_ps(cA, cB);
-							cC = _mm_add_ps(cC, cD);
-							cE = _mm_add_ps(cE, cF);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c8 = _mm_add_ps(c8, cA);
-							cC = _mm_add_ps(cC, cE);
-							c0 = _mm_add_ps(c0, c4);
-							c8 = _mm_add_ps(c8, cC);
-							c0 = _mm_add_ps(c0, c8);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
 
-							_mm_store_ps((float*)(source0 + 8 * x), c0);
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				if(internal.depth == 2)
 				{
@@ -5234,153 +5249,155 @@
 		}
 		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
 		{
-			if(CPUID::supportsSSE())
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE())
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c0 = _mm_add_ps(c0, c2);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c0 = _mm_add_ps(c0, c4);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x++)
+						for(int y = 0; y < height; y++)
 						{
-							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
-							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
-							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
-							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
-							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
-							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
-							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
-							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
-							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
-							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
-							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
-							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
-							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
-							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
-							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
-							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
 
-							c0 = _mm_add_ps(c0, c1);
-							c2 = _mm_add_ps(c2, c3);
-							c4 = _mm_add_ps(c4, c5);
-							c6 = _mm_add_ps(c6, c7);
-							c8 = _mm_add_ps(c8, c9);
-							cA = _mm_add_ps(cA, cB);
-							cC = _mm_add_ps(cC, cD);
-							cE = _mm_add_ps(cE, cF);
-							c0 = _mm_add_ps(c0, c2);
-							c4 = _mm_add_ps(c4, c6);
-							c8 = _mm_add_ps(c8, cA);
-							cC = _mm_add_ps(cC, cE);
-							c0 = _mm_add_ps(c0, c4);
-							c8 = _mm_add_ps(c8, cC);
-							c0 = _mm_add_ps(c0, c8);
-							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
 
-							_mm_store_ps((float*)(source0 + 16 * x), c0);
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				if(internal.depth == 2)
 				{
@@ -5529,259 +5546,261 @@
 		}
 		else if(internal.format == FORMAT_R5G6B5)
 		{
-			if(CPUID::supportsSSE2() && (width % 8) == 0)
-			{
-				if(internal.depth == 2)
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 8) == 0)
 				{
-					for(int y = 0; y < height; y++)
+					if(internal.depth == 2)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
 					}
-				}
-				else if(internal.depth == 4)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 4)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
-							c0 = _mm_avg_epu8(c0, c2);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c3 = _mm_avg_epu16(c2__g_, c3__g_);
-							c1 = _mm_avg_epu16(c1, c3);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c1 = _mm_avg_epu16(c1, c3);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
 					}
-				}
-				else if(internal.depth == 8)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 8)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
-							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
-							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c0 = _mm_avg_epu8(c0, c4);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c3 = _mm_avg_epu16(c2__g_, c3__g_);
-							c5 = _mm_avg_epu16(c4__g_, c5__g_);
-							c7 = _mm_avg_epu16(c6__g_, c7__g_);
-							c1 = _mm_avg_epu16(c1, c3);
-							c5 = _mm_avg_epu16(c5, c7);
-							c1 = _mm_avg_epu16(c1, c5);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c0 = _mm_avg_epu8(c0, c4);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c5 = _mm_avg_epu16(c4__g_, c5__g_);
+								c7 = _mm_avg_epu16(c6__g_, c7__g_);
+								c1 = _mm_avg_epu16(c1, c3);
+								c5 = _mm_avg_epu16(c5, c7);
+								c1 = _mm_avg_epu16(c1, c5);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
 					}
-				}
-				else if(internal.depth == 16)
-				{
-					for(int y = 0; y < height; y++)
+					else if(internal.depth == 16)
 					{
-						for(int x = 0; x < width; x += 8)
+						for(int y = 0; y < height; y++)
 						{
-							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
-							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
-							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
-							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
-							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
-							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
-							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
-							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
-							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
-							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
-							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
-							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
-							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
-							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
-							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
-							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
 
-							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
-							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
-							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
-							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
-							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
-							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
-							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
 
-							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
-							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
-							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
-							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
-							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
-							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
-							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
-							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
-							c0 = _mm_avg_epu8(c0, c2);
-							c4 = _mm_avg_epu8(c4, c6);
-							c8 = _mm_avg_epu8(c8, cA);
-							cC = _mm_avg_epu8(cC, cE);
-							c0 = _mm_avg_epu8(c0, c4);
-							c8 = _mm_avg_epu8(c8, cC);
-							c0 = _mm_avg_epu8(c0, c8);
-							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
-							c1 = _mm_avg_epu16(c0__g_, c1__g_);
-							c3 = _mm_avg_epu16(c2__g_, c3__g_);
-							c5 = _mm_avg_epu16(c4__g_, c5__g_);
-							c7 = _mm_avg_epu16(c6__g_, c7__g_);
-							c9 = _mm_avg_epu16(c8__g_, c9__g_);
-							cB = _mm_avg_epu16(cA__g_, cB__g_);
-							cD = _mm_avg_epu16(cC__g_, cD__g_);
-							cF = _mm_avg_epu16(cE__g_, cF__g_);
-							c1 = _mm_avg_epu8(c1, c3);
-							c5 = _mm_avg_epu8(c5, c7);
-							c9 = _mm_avg_epu8(c9, cB);
-							cD = _mm_avg_epu8(cD, cF);
-							c1 = _mm_avg_epu8(c1, c5);
-							c9 = _mm_avg_epu8(c9, cD);
-							c1 = _mm_avg_epu8(c1, c9);
-							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
-							c0 = _mm_or_si128(c0, c1);
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
+								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
+								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
+								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c8 = _mm_avg_epu8(c8, cA);
+								cC = _mm_avg_epu8(cC, cE);
+								c0 = _mm_avg_epu8(c0, c4);
+								c8 = _mm_avg_epu8(c8, cC);
+								c0 = _mm_avg_epu8(c0, c8);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c5 = _mm_avg_epu16(c4__g_, c5__g_);
+								c7 = _mm_avg_epu16(c6__g_, c7__g_);
+								c9 = _mm_avg_epu16(c8__g_, c9__g_);
+								cB = _mm_avg_epu16(cA__g_, cB__g_);
+								cD = _mm_avg_epu16(cC__g_, cD__g_);
+								cF = _mm_avg_epu16(cE__g_, cF__g_);
+								c1 = _mm_avg_epu8(c1, c3);
+								c5 = _mm_avg_epu8(c5, c7);
+								c9 = _mm_avg_epu8(c9, cB);
+								cD = _mm_avg_epu8(cD, cF);
+								c1 = _mm_avg_epu8(c1, c5);
+								c9 = _mm_avg_epu8(c9, cD);
+								c1 = _mm_avg_epu8(c1, c9);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
 
-							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
 						}
-
-						source0 += pitch;
-						source1 += pitch;
-						source2 += pitch;
-						source3 += pitch;
-						source4 += pitch;
-						source5 += pitch;
-						source6 += pitch;
-						source7 += pitch;
-						source8 += pitch;
-						source9 += pitch;
-						sourceA += pitch;
-						sourceB += pitch;
-						sourceC += pitch;
-						sourceD += pitch;
-						sourceE += pitch;
-						sourceF += pitch;
 					}
+					else ASSERT(false);
 				}
-				else ASSERT(false);
-			}
-			else
+				else
+			#endif
 			{
 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
commit	47dc8676fcff01274de3d3fe98e1de607ff076d3	[log] [tgz]
author	Nicolas Capens <capn@google.com>	Tue Apr 25 12:54:39 2017 -0400
committer	Nicolas Capens <capn@google.com>	Tue Apr 25 19:31:28 2017 +0000
tree	10bbff3e03d01439f2b2b9bdb9f52178e1e696bc
parent	1cc44388266a519499c582a7d78cb5c6cfbcd5bf [diff]