ARM64: FP16.floor() intrinsic for ARMv8

This CL implements an intrinsic for the floor() method with ARMv8.2 FP16
instructions. This intrinsic calls a template GenerateFP16Round function
which will be used to implement other intrinsics such as ceil and
rint.

This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.floor().

The time required in milliseconds to execute the below code on Pixel3:
- Java implementation android.util.Half.floor():
    - big cluster only: 18623
    - little cluster only: 60424
- arm64 intrinsic implementation:
    - big cluster only: 14213 (~24% faster)
    - little cluster only: 54398 (~10% faster)

Analysis of this function with simpleperf showed that approximately only
60-65% of the time is spent in libcore.util.FP16.floor. So the percentage
improvement using intrinsics is likely to be more than the numbers stated
above.

Another reason that the performance improvement with the intrinsic is lower
than expected is that the Java implementation for values between -1 and
1 (abs < 0x3c00) only requires a few instructions, and so should give
performance similar to the intrinsic in this case. In the benchmark function
below, 46.8% of the values tested are between -1 and 1.

public static short benchmarkFloor(){
    short ret = 0;
    long before = 0;
    long after = 0;
    before = System.currentTimeMillis();
    for(int i = 0; i < 50000; i++){
        for (short h = Short.MIN_VALUE; h < Short.MAX_VALUE; h++) {
            ret += FP16.floor(h);
        }
    }
    after = System.currentTimeMillis();
    System.out.println("Time of FP16.floor (ms): " + (after - before));
    System.out.println(ret);
    return ret;
}

Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac

Change-Id: Iad1dd032d456af54932f13c5cf27228f8652a0b5
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index c48aaf5..228255a 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3240,6 +3240,38 @@
   __ Sxth(out, out);  // sign extend due to returning a short type.
 }
 
+template<typename OP>
+void GenerateFP16Round(HInvoke* invoke,
+                       CodeGeneratorARM64* const codegen_,
+                       MacroAssembler* masm,
+                       const OP roundOp) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
+  LocationSummary* locations = invoke->GetLocations();
+  UseScratchRegisterScope scratch_scope(masm);
+  Register out = WRegisterFrom(locations->Out());
+  VRegister half = scratch_scope.AcquireH();
+  __ Fmov(half, WRegisterFrom(locations->InAt(0)));
+  roundOp(half, half);
+  __ Fmov(out, half);
+  __ Sxth(out, out);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFP16Floor(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntToIntLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Floor(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  auto roundOp = [masm](const VRegister& out, const VRegister& in) {
+    __ Frintm(out, in);  // Round towards Minus infinity
+  };
+  GenerateFP16Round(invoke, codegen_, masm, roundOp);
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 74e861f..8217980 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3072,6 +3072,7 @@
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Floor)
 
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index b18bbdd..0bab2a0 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2709,6 +2709,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Floor)
 
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index e4627db..6ed1133 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2359,6 +2359,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Floor)
 
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 95aa4c0..7bc9b63 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3083,6 +3083,7 @@
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(X86, FP16Floor)
 
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 8dbc0d3..e11208c 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2750,6 +2750,7 @@
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor)
 
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index 2ef3522..ec24c42 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -357,6 +357,7 @@
       case Intrinsics::kVarHandleWeakCompareAndSetRelease:
         return 0u;
       case Intrinsics::kUnsafeGetLong:
+      case Intrinsics::kFP16Floor:
       case Intrinsics::kFP16ToFloat:
       case Intrinsics::kFP16ToHalf:
         return kAccCorePlatformApi;
diff --git a/runtime/image.cc b/runtime/image.cc
index 06ba946..2b4099f 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '0', '\0' };  // Chained checksums.
+const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '1', '\0' };  // FP16Floor intrinsic.
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 3759225..4ddf9bb 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -575,6 +575,7 @@
     UNIMPLEMENTED_CASE(CRC32UpdateByteBuffer /* (IJII)I */)
     UNIMPLEMENTED_CASE(FP16ToFloat /* (S)F */)
     UNIMPLEMENTED_CASE(FP16ToHalf /* (F)S */)
+    UNIMPLEMENTED_CASE(FP16Floor /* (S)S */)
     INTRINSIC_CASE(VarHandleFullFence)
     INTRINSIC_CASE(VarHandleAcquireFence)
     INTRINSIC_CASE(VarHandleReleaseFence)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index bb41ca7..045c808 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -165,6 +165,7 @@
   V(MemoryPokeIntNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeIntNative", "(JI)V") \
   V(MemoryPokeLongNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeLongNative", "(JJ)V") \
   V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \
+  V(FP16Floor, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "floor", "(S)S") \
   V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \
   V(FP16ToHalf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toHalf", "(F)S") \
   V(StringCharAt, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "charAt", "(I)C") \
diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java
index 4aa8d55..815c9f5 100644
--- a/test/580-fp16/src-art/Main.java
+++ b/test/580-fp16/src-art/Main.java
@@ -112,9 +112,35 @@
         assertEquals(0xffffe000, TestFP16ToFloatRawIntBits((short)(0xffff)));  // QNaN->QNaN
     }
 
+    public static void testFloor() {
+        // These tests have been taken from the cts HalfTest
+        assertEquals(FP16.POSITIVE_INFINITY, FP16.floor(FP16.POSITIVE_INFINITY));
+        assertEquals(FP16.NEGATIVE_INFINITY, FP16.floor(FP16.NEGATIVE_INFINITY));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.POSITIVE_ZERO));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.floor(FP16.NEGATIVE_ZERO));
+        assertEquals(FP16.NaN, FP16.floor(FP16.NaN));
+        assertEquals(FP16.LOWEST_VALUE, FP16.floor(FP16.LOWEST_VALUE));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.MIN_NORMAL));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor((short) 0x3ff));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.toHalf(0.2f)));
+        assertEquals(-1.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-0.2f))));
+        assertEquals(-1.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-0.7f))));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.toHalf(0.7f)));
+        assertEquals(124.0f, FP16.toFloat(FP16.floor(FP16.toHalf(124.7f))));
+        assertEquals(-125.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-124.7f))));
+        assertEquals(124.0f, FP16.toFloat(FP16.floor(FP16.toHalf(124.2f))));
+        assertEquals(-125.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-124.2f))));
+        // floor for NaN values
+        assertEquals((short) 0x7e01, FP16.floor((short) 0x7c01));
+        assertEquals((short) 0x7f00, FP16.floor((short) 0x7d00));
+        assertEquals((short) 0xfe01, FP16.floor((short) 0xfc01));
+        assertEquals((short) 0xff00, FP16.floor((short) 0xfd00));
+    }
+
     public static void main(String args[]) {
         testHalfToFloatToHalfConversions();
         testToHalf();
         testToFloat();
+        testFloor();
     }
 }