Optimize replication of a scalar across all vector elements.
Since floating-point scalars are stored in vector registers, Subzero
allows us to just bitcast a scalar to the vector type, eliminating a
load and an insert before the shuffle that replicates it.
Change-Id: Ibccf242fd4cfc28604f35f420a04fd4ee6eabe52
Reviewed-on: https://swiftshader-review.googlesource.com/8575
Tested-by: Nicolas Capens <capn@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 5468cb9..e5208cf 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -2928,9 +2928,9 @@
Short4::Short4(RValue<Int4> cast)
{
- int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
- Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
- Value *packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
+ int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+ Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
+ Value *packed = Nucleus::createShuffleVector(short8, short8, select);
Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
@@ -4691,7 +4691,7 @@
RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
{
- int shuffle[16] = {0, 4, 1, 5}; // Real type is v4i32
+ int shuffle[4] = {0, 4, 1, 5}; // Real type is v4i32
auto lowHigh = RValue<Int4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
return As<Short4>(Swizzle(lowHigh, 0xEE));
}
@@ -5008,11 +5008,10 @@
Int4::Int4(RValue<Int> rhs)
{
- Value *vector = loadValue();
- Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
+ Value *vector = Nucleus::createBitCast(rhs.value, Int4::getType());
int swizzle[4] = {0, 0, 0, 0};
- Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+ Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
storeValue(replicate);
}
@@ -5908,11 +5907,10 @@
Float4::Float4(RValue<Float> rhs) : FloatXYZW(this)
{
- Value *vector = loadValue();
- Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
+ Value *vector = Nucleus::createBitCast(rhs.value, Float4::getType());
int swizzle[4] = {0, 0, 0, 0};
- Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
+ Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
storeValue(replicate);
}