ARM64: FP16.ceil() intrinsic for ARMv8

This CL implements an intrinsic for ceil() method with
ARMv8.2 FP16 instructions.

This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.ceil().

The time required in milliseconds to execute the below code on Pixel3:
- Java implementation android.util.Half.ceil():
    - big cluster only: 19447
    - little cluster only: 62638
- arm64 Intrinsic implementation:
    - big cluster only: 14260 (~27% faster)
    - little cluster only: 54387 (~13% faster)

Analysis of this function with simpleperf showed that approximately only
60-65% of the time is spent in libcore.util.FP16.ceil. So the percentage
improvement using intrinsics is likely to be more than the numbers stated
above.

Another reason that the performance improvement with the intrinsic is lower
than expected is that the java implementation for values between -1 and
1 (abs < 0x3c00) only requires a few instructions and should almost give
a similar performance to the intrinsic in this case. In the benchmark function
below, 46.8% of the values tested are between -1 and 1.

public static short benchmarkCeil(){
    short ret = 0;
    long before = 0;
    long after = 0;
    before = System.currentTimeMillis();
    for(int i = 0; i < 50000; i++){
        for (short h = Short.MIN_VALUE; h < Short.MAX_VALUE; h++) {
            ret += FP16.ceil(h);
        }
    }
    after = System.currentTimeMillis();
    System.out.println("Time of FP16.ceil (ms): " + (after - before));
    System.out.println(ret);
    return ret;
}

Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Change-Id: I5474c1d0d7c08ec77a6f82c4fb67f555253bfa67
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 228255a..0859596 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3272,6 +3272,22 @@
   GenerateFP16Round(invoke, codegen_, masm, roundOp);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16Ceil(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntToIntLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Ceil(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  auto roundOp = [masm](const FPRegister& out, const FPRegister& in) {
+    __ Frintp(out, in);  // Round towards Plus infinity
+  };
+  GenerateFP16Round(invoke, codegen_, masm, roundOp);
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 8217980..77dcbfb 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3073,6 +3073,7 @@
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 0bab2a0..fc06691 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2710,6 +2710,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 6ed1133..8a6e94c 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2360,6 +2360,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 7bc9b63..e10214b 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3084,6 +3084,7 @@
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(X86, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(X86, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index e11208c..d8ccd9b 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2751,6 +2751,7 @@
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index ec24c42..152a725 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -357,6 +357,7 @@
       case Intrinsics::kVarHandleWeakCompareAndSetRelease:
         return 0u;
       case Intrinsics::kUnsafeGetLong:
+      case Intrinsics::kFP16Ceil:
       case Intrinsics::kFP16Floor:
       case Intrinsics::kFP16ToFloat:
       case Intrinsics::kFP16ToHalf:
diff --git a/runtime/image.cc b/runtime/image.cc
index 2b4099f..171547b 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '1', '\0' };  // FP16Floor intrinsic
+const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '2', '\0' };  // FP16Ceil intrinsic
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 4ddf9bb..f3ef257 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -576,6 +576,7 @@
     UNIMPLEMENTED_CASE(FP16ToFloat /* (S)F */)
     UNIMPLEMENTED_CASE(FP16ToHalf /* (F)S */)
     UNIMPLEMENTED_CASE(FP16Floor /* (S)S */)
+    UNIMPLEMENTED_CASE(FP16Ceil /* (S)S */)
     INTRINSIC_CASE(VarHandleFullFence)
     INTRINSIC_CASE(VarHandleAcquireFence)
     INTRINSIC_CASE(VarHandleReleaseFence)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index 045c808..ee91066 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -165,6 +165,7 @@
   V(MemoryPokeIntNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeIntNative", "(JI)V") \
   V(MemoryPokeLongNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeLongNative", "(JJ)V") \
   V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \
+  V(FP16Ceil, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "ceil", "(S)S") \
   V(FP16Floor, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "floor", "(S)S") \
   V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \
   V(FP16ToHalf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toHalf", "(F)S") \
diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java
index 815c9f5..de9deda 100644
--- a/test/580-fp16/src-art/Main.java
+++ b/test/580-fp16/src-art/Main.java
@@ -137,10 +137,36 @@
         assertEquals((short) 0xff00, FP16.floor((short) 0xfd00));
     }
 
+    public static void testCeil() {
+        // These tests have been taken from the cts HalfTest
+        assertEquals(FP16.POSITIVE_INFINITY, FP16.ceil(FP16.POSITIVE_INFINITY));
+        assertEquals(FP16.NEGATIVE_INFINITY, FP16.ceil(FP16.NEGATIVE_INFINITY));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.ceil(FP16.POSITIVE_ZERO));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.ceil(FP16.NEGATIVE_ZERO));
+        assertEquals(FP16.NaN, FP16.ceil(FP16.NaN));
+        assertEquals(FP16.LOWEST_VALUE, FP16.ceil(FP16.LOWEST_VALUE));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil(FP16.MIN_NORMAL)));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil((short) 0x3ff)));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(0.2f))));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.ceil(FP16.toHalf(-0.2f)));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(0.7f))));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.ceil(FP16.toHalf(-0.7f)));
+        assertEquals(125.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(124.7f))));
+        assertEquals(-124.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(-124.7f))));
+        assertEquals(125.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(124.2f))));
+        assertEquals(-124.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(-124.2f))));
+        // ceil for NaN values
+        assertEquals((short) 0x7e01, FP16.ceil((short) 0x7c01));
+        assertEquals((short) 0x7f00, FP16.ceil((short) 0x7d00));
+        assertEquals((short) 0xfe01, FP16.ceil((short) 0xfc01));
+        assertEquals((short) 0xff00, FP16.ceil((short) 0xfd00));
+    }
+
     public static void main(String args[]) {
         testHalfToFloatToHalfConversions();
         testToHalf();
         testToFloat();
         testFloor();
+        testCeil();
     }
 }