Optimize scalar-to-vector replication.

Since floating-point scalars are stored in vector registers, Subzero
allows us to bitcast a scalar directly to a vector type, eliminating
the load and insert-element that previously preceded the shuffle.

Change-Id: Ibccf242fd4cfc28604f35f420a04fd4ee6eabe52
Reviewed-on: https://swiftshader-review.googlesource.com/8575
Tested-by: Nicolas Capens <capn@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 5468cb9..e5208cf 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -2928,9 +2928,9 @@
 
 	Short4::Short4(RValue<Int4> cast)
 	{
-		int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
-		Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
-		Value *packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
+		int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
+		Value *packed = Nucleus::createShuffleVector(short8, short8, select);
 
 		Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
 		Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
@@ -4691,7 +4691,7 @@
 
 	RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
 	{
-		int shuffle[16] = {0, 4, 1, 5};   // Real type is v4i32
+		int shuffle[4] = {0, 4, 1, 5};   // Real type is v4i32
 		auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
 		return As<Short4>(Swizzle(lowHigh, 0xEE));
 	}
@@ -5008,11 +5008,10 @@
 
 	Int4::Int4(RValue<Int> rhs)
 	{
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
+		Value *vector = Nucleus::createBitCast(rhs.value, Int4::getType());
 
 		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
 
 		storeValue(replicate);
 	}
@@ -5908,11 +5907,10 @@
 
 	Float4::Float4(RValue<Float> rhs) : FloatXYZW(this)
 	{
-		Value *vector = loadValue();
-		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
+		Value *vector = Nucleus::createBitCast(rhs.value, Float4::getType());
 
 		int swizzle[4] = {0, 0, 0, 0};
-		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+		Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
 
 		storeValue(replicate);
 	}