ARM64: FP16 greater/less/greaterEquals/lessEquals intrinsics for ARMv8

This CL implements intrinsics for greater, greaterEquals, less,
lessEquals methods with ARMv8.2 FP16 instructions. This requires the
ARMv8.2 AArch64 asimd half precision extension.

The time in milliseconds required to execute the benchmark below on
Pixel3 is listed for each of the four intrinsics. (The code shown is for
FP16.less; the code for the other intrinsics is similar.)

- Java implementation libcore.util.FP16.less():
    - big cluster only: 19876
    - little cluster only: 47525
- arm64 Intrinsic implementation for less:
    - big cluster only: 14526 (~27% faster)
    - little cluster only: 45815 (~4% faster)

- Java implementation libcore.util.FP16.lessEquals():
    - big cluster only: 19856
    - little cluster only: 47419
- arm64 Intrinsic implementation for lessEquals:
    - big cluster only: 14469 (~27% faster)
    - little cluster only: 45762 (~4% faster)

- Java implementation libcore.util.FP16.greater():
    - big cluster only: 19854
    - little cluster only: 47623
- arm64 Intrinsic implementation for greater:
    - big cluster only: 14519 (~27% faster)
    - little cluster only: 45722 (~4% faster)

- Java implementation libcore.util.FP16.greaterEquals():
    - big cluster only: 19865
    - little cluster only: 47216
- arm64 Intrinsic implementation for greaterEquals:
    - big cluster only: 14485 (~27% faster)
    - little cluster only: 45729 (~4% faster)

/**
 * Micro-benchmark for FP16.less: runs four comparisons per iteration for
 * 1e9 iterations, then prints the elapsed wall-clock time in milliseconds
 * followed by the accumulated result.
 *
 * @return the logical OR of every FP16.less result produced in the loop
 */
public static boolean benchmarkComparison(){
    // Accumulates every comparison result; it is printed and returned below.
    // NOTE(review): presumably this keeps the calls from being optimized
    // away as dead code -- confirm.
    boolean ret = false;
    long before = 0;
    long after = 0;
    // Timestamp taken immediately before the measured loop.
    before = System.currentTimeMillis();
    // The bound 1e9 is a double literal, so the long counter is compared
    // as a double on every iteration.
    for(long i = 0; i < 1e9; i++){
        // FP16.toHalf(12.3) = 0x4a26, FP16.toHalf(12.4) = 0x4a33
        // FP16.toHalf(-12.3) = 0xca26, FP16.toHalf(-12.4) = 0xca33
        // Both argument orders are exercised for the positive pair and
        // for the negative pair.
        ret |= FP16.less((short) 0x4a26,(short) 0x4a33);
        ret |= FP16.less((short) 0x4a33,(short) 0x4a26);
        ret |= FP16.less((short) 0xca26,(short) 0xca33);
        ret |= FP16.less((short) 0xca33,(short) 0xca26);
    }
    // Timestamp taken immediately after the measured loop.
    after = System.currentTimeMillis();
    System.out.println("Time of FP16.less (ms): " + (after - before));
    System.out.println(ret);
    return ret;
}

Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Change-Id: Id1a2c3e7328c82c798fcaf1fa74f5908a822cd0b
diff --git a/runtime/image.cc b/runtime/image.cc
index 2566f80..07fcc8b 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '3', '\0' };  // FP16Rint intrinsic
+const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '4', '\0' };  // FP16 gt/ge/lt/le intrinsic
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,