[AMDGPU] Improve reciprocal handling When denormals are supported we are producing a full division for 1.0f / x. That still can be replaced by the faster version: bool c = fabs(x) > 0x1.0p+96f; float s = c ? 0x1.0p-32f : 1.0f; x *= s; return s * v_rcp_f32(x) in case if requested accuracy is 2.5ulp or less. The same version is used if denormals are not supported for non 1.0 numerators, where just v_rcp_f32 is then used for 1.0 numerator. The optimization of 1/x is extended to the case -1/x, which is the same except for the resulting sign bit. OpenCL conformance passed with both enabled and disabled denorms. Differential Revision: https://reviews.llvm.org/D47805 llvm-svn: 334142

commit: df61be70b23460574d7a2875c897643daada9eee [log] [tgz]
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> Wed Jun 06 22:22:32 2018 +0000
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> Wed Jun 06 22:22:32 2018 +0000
tree: 860c6a57a60bbdb0f3ebb880eaeeecf8308fe27a
parent: 566b402a131c7a045bb5d4cef8e3131440d5aae2 [diff] [blame]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index f0b9c9e..53fb9e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

@@ -372,13 +372,18 @@
   return true;
 }
 
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
-    return false;
+    return HasDenormals;
+
+  if (UnsafeDiv)
+    return true;
+
+  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
 
   // Reciprocal f32 is handled separately without denormals.
-  return UnsafeDiv || CNum->isExactlyValue(+1.0);
+  return HasDenormals ^ IsOne;
 }
 
 // Insert an intrinsic for fast fdiv for safe math situations where we can
@@ -404,7 +409,7 @@
                                       FMF.allowReciprocal();
 
   // With UnsafeDiv node will be optimized to just rcp and mul.
-  if (ST->hasFP32Denormals() || UnsafeDiv)
+  if (UnsafeDiv)
     return false;
 
   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
@@ -418,6 +423,7 @@
 
   Value *NewFDiv = nullptr;
 
+  bool HasDenormals = ST->hasFP32Denormals();
   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
     NewFDiv = UndefValue::get(VT);
 
@@ -428,7 +434,7 @@
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
       Value *NewElt;
 
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
       } else {
         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -437,7 +443,7 @@
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
   } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv))
+    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
   }
 
@@ -447,7 +453,7 @@
     FDiv.eraseFromParent();
   }
 
-  return true;
+  return !!NewFDiv;
 }
 
 static bool hasUnsafeFPMath(const Function &F) {
commit	df61be70b23460574d7a2875c897643daada9eee	[log] [tgz]
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	Wed Jun 06 22:22:32 2018 +0000
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	Wed Jun 06 22:22:32 2018 +0000
tree	860c6a57a60bbdb0f3ebb880eaeeecf8308fe27a
parent	566b402a131c7a045bb5d4cef8e3131440d5aae2 [diff] [blame]