Matrix determinant and inverse implementation

Implementation for determinant has been done directly in
ShaderCore in order to avoid having to allocate temporaries
manually in OutputASM.

For now, the implementation for the inverse matrix is very
simple, i.e., it doesn't attempt to re-use results from the
cofactor matrix computation to compute the determinant or
do any other kind of optimization, but it works.

Change-Id: I0fc70133809ae2752dc567bf58b60d7af7a88009
Reviewed-on: https://swiftshader-review.googlesource.com/4000
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/OpenGL/compiler/OutputASM.cpp b/src/OpenGL/compiler/OutputASM.cpp
index 85b6cca..3ed7975 100644
--- a/src/OpenGL/compiler/OutputASM.cpp
+++ b/src/OpenGL/compiler/OutputASM.cpp
@@ -636,6 +636,74 @@
 		return true;

 	}

 

+	void OutputASM::emitDeterminant(TIntermTyped *result, TIntermTyped *arg, int size, int col, int row, int outCol, int outRow) // Emits one instruction yielding the size x size determinant of arg or, when col and row are both >= 0, a signed cofactor of arg that skips that column/row

+	{ // In every case the destination is register (result + outCol) with a write mask of (1 << outRow)

+		switch(size)

+		{

+		case 1: // Used for cofactor computation only

+			{

+				// For a 2x2 matrix, the cofactor is simply a transposed move or negate

+				bool isMov = (row == col); // Diagonal entries are copied, off-diagonal entries are negated

+				sw::Shader::Opcode op = isMov ? sw::Shader::OPCODE_MOV : sw::Shader::OPCODE_NEG;

+				Instruction *mov = emit(op, result, arg);

+				mov->src[0].index += isMov ? 1 - row : row; // A copy reads the opposite register (1 - row); a negate reads register 'row'

+				mov->src[0].swizzle = 0x55 * (isMov ? 1 - col : col); // 0x55 * n replicates component n: 0x00 = xxxx, 0x55 = yyyy

+				mov->dst.index += outCol;

+				mov->dst.mask = 1 << outRow;

+			}

+			break;

+		case 2: // 2x2 determinant, or cofactor taken from a 3x3 matrix

+			{

+				static const unsigned int swizzle[3] = { 0x99, 0x88, 0x44 }; // xy?? : yzyz, xzxz, xyxy (indexed by the skipped row; the last entry is also used for a plain 2x2 determinant)

+

+				bool isCofactor = (col >= 0) && (row >= 0); // The default arguments (-1, -1) select the plain determinant

+				int col0 = (isCofactor && (col <= 0)) ? 1 : 0; // Register offsets of the two columns that remain once 'col' is skipped

+				int col1 = (isCofactor && (col <= 1)) ? 2 : 1;

+				bool negate = isCofactor && ((col & 0x01) ^ (row & 0x01)); // True when col and row have different parity

+

+				Instruction *det = emit(sw::Shader::OPCODE_DET2, result, arg, arg);

+				det->src[0].index += negate ? col1 : col0; // Negation is obtained by swapping the two source operands

+				det->src[1].index += negate ? col0 : col1;

+				det->src[0].swizzle = det->src[1].swizzle = swizzle[isCofactor ? row : 2];

+				det->dst.index += outCol;

+				det->dst.mask = 1 << outRow;

+			}

+			break;

+		case 3: // 3x3 determinant, or cofactor taken from a 4x4 matrix

+			{

+				static const unsigned int swizzle[4] = { 0xF9, 0xF8, 0xF4, 0xE4 }; // xyz? : yzww, xzww, xyww, xyzw (indexed by the skipped row; the last entry is used for a plain 3x3 determinant)

+

+				bool isCofactor = (col >= 0) && (row >= 0); // The default arguments (-1, -1) select the plain determinant

+				int col0 = (isCofactor && (col <= 0)) ? 1 : 0; // Register offsets of the three columns that remain once 'col' is skipped

+				int col1 = (isCofactor && (col <= 1)) ? 2 : 1;

+				int col2 = (isCofactor && (col <= 2)) ? 3 : 2;

+				bool negate = isCofactor && ((col & 0x01) ^ (row & 0x01)); // True when col and row have different parity

+

+				Instruction *det = emit(sw::Shader::OPCODE_DET3, result, arg, arg, arg);

+				det->src[0].index += col0;

+				det->src[1].index += negate ? col2 : col1; // Negation is obtained by swapping the second and third source operands

+				det->src[2].index += negate ? col1 : col2;

+				det->src[0].swizzle = det->src[1].swizzle = det->src[2].swizzle = swizzle[isCofactor ? row : 3];

+				det->dst.index += outCol;

+				det->dst.mask = 1 << outRow;

+			}

+			break;

+		case 4: // Plain 4x4 determinant; col and row are not consulted here

+			{

+				Instruction *det = emit(sw::Shader::OPCODE_DET4, result, arg, arg, arg, arg);

+				det->src[1].index += 1; // Sources 0..3 are the four consecutive registers starting at arg

+				det->src[2].index += 2;

+				det->src[3].index += 3;

+				det->dst.index += outCol;

+				det->dst.mask = 1 << outRow;

+			}

+			break;

+		default: // Any other size is reported through UNREACHABLE

+			UNREACHABLE(size);

+			break;

+		}

+	}

+

 	bool OutputASM::visitUnary(Visit visit, TIntermUnary *node)

 	{

 		if(currentScope != emitScope)

@@ -807,6 +875,48 @@
 				}

 			}

 			break;

+		case EOpDeterminant:

+			if(visit == PostVisit)

+			{

+				int size = arg->getNominalSize();

+				ASSERT(size == arg->getSecondarySize());

+

+				emitDeterminant(result, arg, size);

+			}

+			break;

+		case EOpInverse:

+			if(visit == PostVisit)

+			{

+				int size = arg->getNominalSize();

+				ASSERT(size == arg->getSecondarySize());

+

+				// Compute transposed matrix of cofactors

+				for(int i = 0; i < size; ++i)

+				{

+					for(int j = 0; j < size; ++j)

+					{

+						// For a 2x2 matrix, the cofactor is simply a transposed move or negate

+						// For a 3x3 or 4x4 matrix, the cofactor is a transposed determinant

+						emitDeterminant(result, arg, size - 1, j, i, i, j);

+					}

+				}

+

+				// Compute 1 / determinant

+				Temporary invDet(this);

+				emitDeterminant(&invDet, arg, size);

+				Constant one(1.0f, 1.0f, 1.0f, 1.0f);

+				Instruction *div = emit(sw::Shader::OPCODE_DIV, &invDet, &one, &invDet);

+				div->src[1].swizzle = 0x00; // xxxx

+

+				// Divide transposed matrix of cofactors by determinant

+				for(int i = 0; i < size; ++i)

+				{

+					Instruction *div = emit(sw::Shader::OPCODE_MUL, result, result, &invDet);

+					div->src[0].index += i;

+					div->dst.index += i;

+				}

+			}

+			break;

 		default: UNREACHABLE(node->getOp());

 		}

 

@@ -1493,7 +1603,7 @@
 		return IsSampler(type.getBasicType()) && (type.getQualifier() == EvqUniform || type.getQualifier() == EvqTemporary);

 	}

 

-	Instruction *OutputASM::emit(sw::Shader::Opcode op, TIntermTyped *dst, TIntermNode *src0, TIntermNode *src1, TIntermNode *src2, int index)

+	Instruction *OutputASM::emit(sw::Shader::Opcode op, TIntermTyped *dst, TIntermNode *src0, TIntermNode *src1, TIntermNode *src2, TIntermNode *src3, int index)

 	{

 		if(isSamplerRegister(dst))

 		{

@@ -1513,6 +1623,7 @@
 		argument(instruction->src[0], src0, index);

 		argument(instruction->src[1], src1, index);

 		argument(instruction->src[2], src2, index);

+		argument(instruction->src[3], src3, index);

 

 		shader->append(instruction);

 

@@ -1568,7 +1679,7 @@
 	{

 		for(int index = 0; index < dst->elementRegisterCount(); index++)

 		{

-			emit(op, dst, src0, src1, src2, index);

+			emit(op, dst, src0, src1, src2, 0, index);

 		}

 	}

 

diff --git a/src/OpenGL/compiler/OutputASM.h b/src/OpenGL/compiler/OutputASM.h
index 4e3a641..0ea4811 100644
--- a/src/OpenGL/compiler/OutputASM.h
+++ b/src/OpenGL/compiler/OutputASM.h
@@ -176,11 +176,12 @@
 		virtual bool visitBranch(Visit visit, TIntermBranch*);

 

 		sw::Shader::Opcode getOpcode(sw::Shader::Opcode op, TIntermTyped *in) const;

-		Instruction *emit(sw::Shader::Opcode op, TIntermTyped *dst = 0, TIntermNode *src0 = 0, TIntermNode *src1 = 0, TIntermNode *src2 = 0, int index = 0);

+		Instruction *emit(sw::Shader::Opcode op, TIntermTyped *dst = 0, TIntermNode *src0 = 0, TIntermNode *src1 = 0, TIntermNode *src2 = 0, TIntermNode *src3 = 0, int index = 0);

 		Instruction *emitCast(TIntermTyped *dst, TIntermTyped *src);

 		void emitBinary(sw::Shader::Opcode op, TIntermTyped *dst = 0, TIntermNode *src0 = 0, TIntermNode *src1 = 0, TIntermNode *src2 = 0);

 		void emitAssign(sw::Shader::Opcode op, TIntermTyped *result, TIntermTyped *lhs, TIntermTyped *src0, TIntermTyped *src1 = 0);

 		void emitCmp(sw::Shader::Control cmpOp, TIntermTyped *dst, TIntermNode *left, TIntermNode *right, int index = 0);

+		void emitDeterminant(TIntermTyped *result, TIntermTyped *arg, int size, int col = -1, int row = -1, int outCol = 0, int outRow = 0);

 		void argument(sw::Shader::SourceParameter &parameter, TIntermNode *argument, int index = 0);

 		void copy(TIntermTyped *dst, TIntermNode *src, int offset = 0);

 		void assignLvalue(TIntermTyped *dst, TIntermTyped *src);

diff --git a/src/Shader/PixelProgram.cpp b/src/Shader/PixelProgram.cpp
index df629d9..aca41bd 100644
--- a/src/Shader/PixelProgram.cpp
+++ b/src/Shader/PixelProgram.cpp
@@ -171,6 +171,9 @@
 			case Shader::OPCODE_DP2ADD:     dp2add(d, s0, s1, s2);                         break;
 			case Shader::OPCODE_DP3:        dp3(d, s0, s1);                                break;
 			case Shader::OPCODE_DP4:        dp4(d, s0, s1);                                break;
+			case Shader::OPCODE_DET2:       det2(d, s0, s1);                               break;
+			case Shader::OPCODE_DET3:       det3(d, s0, s1, s2);                           break;
+			case Shader::OPCODE_DET4:       det4(d, s0, s1, s2, s3);                       break;
 			case Shader::OPCODE_CMP0:       cmp0(d, s0, s1, s2);                           break;
 			case Shader::OPCODE_ICMP:       icmp(d, s0, s1, control);                      break;
 			case Shader::OPCODE_UCMP:       ucmp(d, s0, s1, control);                      break;
diff --git a/src/Shader/Shader.cpp b/src/Shader/Shader.cpp
index 11c3f20..f92ee73 100644
--- a/src/Shader/Shader.cpp
+++ b/src/Shader/Shader.cpp
@@ -764,6 +764,9 @@
 		case OPCODE_DIST4:			return "dist4";
 		case OPCODE_DP3:			return "dp3";
 		case OPCODE_DP4:			return "dp4";
+		case OPCODE_DET2:			return "det2";
+		case OPCODE_DET3:			return "det3";
+		case OPCODE_DET4:			return "det4";
 		case OPCODE_MIN:			return "min";
 		case OPCODE_IMIN:			return "imin";
 		case OPCODE_UMIN:			return "umin";
diff --git a/src/Shader/Shader.hpp b/src/Shader/Shader.hpp
index 339279f..094d3b7 100644
--- a/src/Shader/Shader.hpp
+++ b/src/Shader/Shader.hpp
@@ -188,6 +188,9 @@
 			OPCODE_B2I,   // Bool to int

 			OPCODE_U2B,   // Uint to bool

 			OPCODE_B2U,   // Bool to uint

+			OPCODE_DET2,

+			OPCODE_DET3,

+			OPCODE_DET4,

 			OPCODE_ALL,

 			OPCODE_ANY,

 			OPCODE_NEG,

diff --git a/src/Shader/ShaderCore.cpp b/src/Shader/ShaderCore.cpp
index 406b038..6ad3953 100644
--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -1139,6 +1139,34 @@
 		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
 	}
 
+	void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) // Determinant of the 2x2 matrix formed by the .xy parts of src0 and src1
+	{
+		dst.x = src0.x * src1.y - src0.y * src1.x; // Antisymmetric in (src0, src1): swapping the operands flips the sign
+		dst.y = dst.z = dst.w = dst.x; // Replicate the scalar result into every component of dst
+	}
+
+	void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) // 3x3 determinant, computed in two steps through crs() and dp3()
+	{
+		crs(dst, src1, src2); // dst is used as scratch; presumably receives the cross product src1 x src2 (crs is defined elsewhere - confirm)
+		dp3(dst, dst, src0); // Presumably the 3-component dot product of that intermediate with src0, i.e. src0 . (src1 x src2) - confirm against dp3
+	}
+
+	void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3) // 4x4 determinant by cofactor expansion along src0
+	{
+		dst.x = src2.z * src3.w - src2.w * src3.z; // dst.x/.y/.z temporarily hold 2x2 sub-determinants built from the .z/.w components of src1..src3
+		dst.y = src1.w * src3.z - src1.z * src3.w;
+		dst.z = src1.z * src2.w - src1.w * src2.z;
+		dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) - // src0.x term: reuses the three scratch values with the .y components
+		        src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) + // src0.y term: same scratch values with the .x components, subtracted
+		        src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) + // src0.z term: 2x2 factors built inline from the .y/.w components
+		                  src2.x * (src1.w * src3.y - src1.y * src3.w) +
+		                  src3.x * (src1.y * src2.w - src1.w * src2.y)) +
+		        src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) + // src0.w term: 2x2 factors built inline from the .y/.z components, sign folded into the operand order
+		                  src2.x * (src1.y * src3.z - src1.z * src3.y) +
+		                  src3.x * (src1.z * src2.y - src1.y * src2.z));
+		dst.y = dst.z = dst.w = dst.x; // Replicate the scalar result, overwriting the scratch values in .y and .z
+	}
+
 	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
 	{
 		dst.x = Frac(src.x);
diff --git a/src/Shader/ShaderCore.hpp b/src/Shader/ShaderCore.hpp
index c3308aa..565d682 100644
--- a/src/Shader/ShaderCore.hpp
+++ b/src/Shader/ShaderCore.hpp
@@ -284,6 +284,9 @@
 		void dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);

 		void dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);

 		void dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);

+		void det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);

+		void det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);

+		void det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3);

 		void min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);

 		void imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);

 		void umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);

diff --git a/src/Shader/VertexProgram.cpp b/src/Shader/VertexProgram.cpp
index 35581af..2040480 100644
--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -102,6 +102,7 @@
 			Src src0 = instruction->src[0];
 			Src src1 = instruction->src[1];
 			Src src2 = instruction->src[2];
+			Src src3 = instruction->src[3];
 
 			bool predicate = instruction->predicate;
 			Control control = instruction->control;
@@ -112,10 +113,12 @@
 			Vector4f s0;
 			Vector4f s1;
 			Vector4f s2;
+			Vector4f s3;
 
 			if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegisterF(r, src0);
 			if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegisterF(r, src1);
 			if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegisterF(r, src2);
+			if(src3.type != Shader::PARAMETER_VOID) s3 = fetchRegisterF(r, src3);
 
 			switch(opcode)
 			{
@@ -151,6 +154,9 @@
 			case Shader::OPCODE_DP2:		dp2(d, s0, s1);					break;
 			case Shader::OPCODE_DP3:		dp3(d, s0, s1);					break;
 			case Shader::OPCODE_DP4:		dp4(d, s0, s1);					break;
+			case Shader::OPCODE_DET2:       det2(d, s0, s1);                break;
+			case Shader::OPCODE_DET3:       det3(d, s0, s1, s2);            break;
+			case Shader::OPCODE_DET4:       det4(d, s0, s1, s2, s3);        break;
 			case Shader::OPCODE_ATT:		att(d, s0, s1);					break;
 			case Shader::OPCODE_EXP2X:		exp2x(d, s0, pp);				break;
 			case Shader::OPCODE_EXP2:		exp2(d, s0, pp);				break;