Half float packing and unpacking intrinsic functions
Implements the packHalf2x16 and unpackHalf2x16 intrinsic functions.
Change-Id: I55212f8bc2ecd30e0108858d74117c3cf60733ed
Reviewed-on: https://swiftshader-review.googlesource.com/5056
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Shader/PixelProgram.cpp b/src/Shader/PixelProgram.cpp
index d39716f..b005863 100644
--- a/src/Shader/PixelProgram.cpp
+++ b/src/Shader/PixelProgram.cpp
@@ -230,8 +230,10 @@
case Shader::OPCODE_UINTBITSTOFLOAT: d = s0; break;
case Shader::OPCODE_PACKSNORM2x16: packSnorm2x16(d, s0); break;
case Shader::OPCODE_PACKUNORM2x16: packUnorm2x16(d, s0); break;
+ case Shader::OPCODE_PACKHALF2x16: packHalf2x16(d, s0); break;
case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0); break;
case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKHALF2x16: unpackHalf2x16(d, s0); break;
case Shader::OPCODE_POWX: powx(d, s0, s1, pp); break;
case Shader::OPCODE_POW: pow(d, s0, s1, pp); break;
case Shader::OPCODE_SGN: sgn(d, s0); break;
diff --git a/src/Shader/ShaderCore.cpp b/src/Shader/ShaderCore.cpp
index 4f5af1a..b978d9e 100644
--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -1123,6 +1123,57 @@
Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
}
+ void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)   // Converts the 32-bit float bit patterns in floatBits to 16-bit half bit patterns.
+ {   // storeInUpperBits == false: dst is overwritten, half in bits 0-15. true: the half is OR'ed into bits 16-31 of the existing dst.
+ static const uint32_t mask_sign = 0x80000000u;   // float sign bit
+ static const uint32_t mask_round = ~0xfffu;   // clears the low 12 bits; also reused as an integer bias (subtracting it adds 0x1000 modulo 2^32)
+ static const uint32_t c_f32infty = 255 << 23;   // float +infinity bit pattern (exponent field all ones)
+ static const uint32_t c_magic = 15 << 23;   // read as a float: 2^(15 - 127), moves the exponent from the float bias to the half bias
+ static const uint32_t c_nanbit = 0x200;   // half mantissa bit 9, set for NaN inputs so the result is not infinity
+ static const uint32_t c_infty_as_fp16 = 0x7c00;   // half +infinity bit pattern
+ static const uint32_t c_clamp = (31 << 23) - 0x1000;   // largest scaled value allowed; after the 0x1000 bias and >> 13 it becomes 0x7c00
+ // Split the input into its sign bit and its magnitude (sign bit cleared).
+ UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits);
+ UInt4 absf = As<UInt4>(floatBits) ^ justsign;
+ UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf);   // used as a mask for finite inputs (magnitude below the infinity pattern); presumably CmpNLE means "not <=", i.e. >
+ // 'joined' is the half magnitude: the finite result where b_isnormal is set, the Inf/NaN encoding elsewhere.
+ // Note: on a tie this version does not round to nearest even as defined by IEEE 754-2008; it rounds to +inf instead,
+ // which is sufficient for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
+ UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)),   // finite path: drop the low bits, rescale the exponent,
+ As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) |   // clamp, add the 0x1000 rounding bias, shift into half position
+ ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) |   // non-finite path: magnitudes above the infinity pattern (NaN) get c_nanbit,
+ UInt4(c_infty_as_fp16)));   // and every non-finite input gets the half infinity exponent
+ // Merge the sign: it stays at bit 31 for the upper half, or moves to bit 15 (sign >> 16) for the lower half.
+ dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16));
+ }
+
+ void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)   // Expands half bit patterns held in bits 0-15 of halfBits into 32-bit floats written to dst.
+ {
+ static const uint32_t mask_nosign = 0x7FFF;   // half exponent and mantissa, sign excluded
+ static const uint32_t magic = (254 - 15) << 23;   // read as a float: 2^(239 - 127) = 2^112, moves the exponent from the half bias to the float bias
+ static const uint32_t was_infnan = 0x7BFF;   // largest finite half magnitude; anything above it is Inf or NaN
+ static const uint32_t exp_infnan = 255 << 23;   // float exponent field with all bits set
+ // Strip the sign, then OR three parts together: the rescaled magnitude, the sign, and the Inf/NaN exponent.
+ UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign);
+ dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |   // magnitude: shift into float position, then fix the exponent with a float multiply
+ ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) |   // sign: the bits left over above bit 14, moved up by 16 (bit 15 -> bit 31)
+ (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan)));   // Inf/NaN inputs: force the float exponent to all ones
+ }
+
+ void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)   // Packs s0.x and s0.y as two half bit patterns into d.x; no other component of d is written here.
+ {
+ // Layout of d.x: half2 | half1, i.e. bits 16-31 = half(s0.y), bits 0-15 = half(s0.x)
+ floatToHalfBits(d.x, s0.x, false);   // must run first: this call overwrites d.x
+ floatToHalfBits(d.x, s0.y, true);   // OR's the second half into the upper 16 bits of d.x
+ }
+
+ void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)   // Unpacks the two half bit patterns stored in s0.x into dst.x and dst.y as floats.
+ {
+ // Layout of s0.x: half2 | half1, i.e. bits 16-31 -> dst.y, bits 0-15 -> dst.x
+ halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF)));
+ halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16));   // NOTE(review): s0.x is read again after dst.x was written; presumably dst never aliases s0 -- confirm against callers
+ }
+
void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
{
// round(clamp(c, -1.0, 1.0) * 32767.0)
diff --git a/src/Shader/ShaderCore.hpp b/src/Shader/ShaderCore.hpp
index 34c319e..7eeec42 100644
--- a/src/Shader/ShaderCore.hpp
+++ b/src/Shader/ShaderCore.hpp
@@ -313,6 +313,8 @@
void att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
void lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
void smooth(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+ void packHalf2x16(Vector4f &dst, const Vector4f &src);
+ void unpackHalf2x16(Vector4f &dst, const Vector4f &src);
void packSnorm2x16(Vector4f &dst, const Vector4f &src);
void packUnorm2x16(Vector4f &dst, const Vector4f &src);
void unpackSnorm2x16(Vector4f &dst, const Vector4f &src);
@@ -383,6 +385,8 @@
void cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
void cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
void select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2);
+ void floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits);
+ void halfToFloatBits(Float4& dst, const Float4& halfBits);
};
}
diff --git a/src/Shader/VertexProgram.cpp b/src/Shader/VertexProgram.cpp
index c05ace3..1989c87 100644
--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -205,8 +205,10 @@
case Shader::OPCODE_UINTBITSTOFLOAT: d = s0; break;
case Shader::OPCODE_PACKSNORM2x16: packSnorm2x16(d, s0); break;
case Shader::OPCODE_PACKUNORM2x16: packUnorm2x16(d, s0); break;
+ case Shader::OPCODE_PACKHALF2x16: packHalf2x16(d, s0); break;
case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0); break;
case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0); break;
+ case Shader::OPCODE_UNPACKHALF2x16: unpackHalf2x16(d, s0); break;
case Shader::OPCODE_M3X2: M3X2(d, s0, src1); break;
case Shader::OPCODE_M3X3: M3X3(d, s0, src1); break;
case Shader::OPCODE_M3X4: M3X4(d, s0, src1); break;