[ARM][AArch64] Implement __cls,  __clsl and __clsll intrinsics from ACLE

Summary:
Writing support for three ACLE functions:
  unsigned int __cls(uint32_t x)
  unsigned int __clsl(unsigned long x)
  unsigned int __clsll(uint64_t x)

CLS stands for "Count number of leading sign bits".

In AArch64, these two intrinsics can be translated into the 'cls'
instruction directly. In AArch32, on the other hand, this functionality
is achieved by implementing it in terms of clz (count number of leading
zeros).

Reviewers: compnerd

Reviewed By: compnerd

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D69250
diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
index 4df8d5a..f07c567 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -33,6 +33,8 @@
 // Bit manipulation
 BUILTIN(__builtin_arm_rbit, "UiUi", "nc")
 BUILTIN(__builtin_arm_rbit64, "WUiWUi", "nc")
+BUILTIN(__builtin_arm_cls, "UiZUi", "nc")
+BUILTIN(__builtin_arm_cls64, "UiWUi", "nc")
 
 // HINT
 BUILTIN(__builtin_arm_nop, "v", "")
diff --git a/clang/include/clang/Basic/BuiltinsARM.def b/clang/include/clang/Basic/BuiltinsARM.def
index 5f8308d..81991a5 100644
--- a/clang/include/clang/Basic/BuiltinsARM.def
+++ b/clang/include/clang/Basic/BuiltinsARM.def
@@ -115,6 +115,8 @@
 
 // Bit manipulation
 BUILTIN(__builtin_arm_rbit, "UiUi", "nc")
+BUILTIN(__builtin_arm_cls, "UiZUi", "nc")
+BUILTIN(__builtin_arm_cls64, "UiWUi", "nc")
 
 // Store and load exclusive
 BUILTIN(__builtin_arm_ldrexd, "LLUiv*", "")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9a56116..648837a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6055,6 +6055,16 @@
         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
   }
 
+  if (BuiltinID == ARM::BI__builtin_arm_cls) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
+  }
+  if (BuiltinID == ARM::BI__builtin_arm_cls64) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
+                              "cls");
+  }
+
   if (BuiltinID == ARM::BI__clear_cache) {
     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
     const FunctionDecl *FD = E->getDirectCallee();
@@ -7108,6 +7118,17 @@
         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
   }
 
+  if (BuiltinID == AArch64::BI__builtin_arm_cls) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg,
+                              "cls");
+  }
+  if (BuiltinID == AArch64::BI__builtin_arm_cls64) {
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg,
+                              "cls");
+  }
+
   if (BuiltinID == AArch64::BI__builtin_arm_jcvt) {
     assert((getContext().getTypeSize(E->getType()) == 32) &&
            "__jcvt of unusual size!");
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 942172f..137092f 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -139,6 +139,26 @@
   return __builtin_clzll(__t);
 }
 
+/* CLS */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__cls(uint32_t __t) {
+  return __builtin_arm_cls(__t);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clsl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_arm_cls(__t);
+#else
+  return __builtin_arm_cls64(__t);
+#endif
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clsll(uint64_t __t) {
+  return __builtin_arm_cls64(__t);
+}
+
 /* REV */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __rev(uint32_t __t) {
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index 7463d0d..9f0ad22 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -175,6 +175,24 @@
   return __clzll(t);
 }
 
+// ARM-LABEL: test_cls
+// ARM: call i32 @llvm.arm.cls(i32 %t)
+unsigned test_cls(uint32_t t) {
+  return __cls(t);
+}
+
+// ARM-LABEL: test_clsl
+// AArch32: call i32 @llvm.arm.cls(i32 %t)
+// AArch64: call i32 @llvm.arm.cls64(i64 %t)
+unsigned test_clsl(unsigned long t) {
+  return __clsl(t);
+}
+// ARM-LABEL: test_clsll
+// ARM: call i32 @llvm.arm.cls64(i64 %t)
+unsigned test_clsll(uint64_t t) {
+  return __clsll(t);
+}
+
 // ARM-LABEL: test_rev
 // ARM: call i32 @llvm.bswap.i32(i32 %t)
 uint32_t test_rev(uint32_t t) {
diff --git a/clang/test/CodeGen/builtins-arm.c b/clang/test/CodeGen/builtins-arm.c
index 4941411..f3c4eca 100644
--- a/clang/test/CodeGen/builtins-arm.c
+++ b/clang/test/CodeGen/builtins-arm.c
@@ -256,6 +256,21 @@
   __builtin_arm_wsrp("sysreg", v);
 }
 
+unsigned int cls(uint32_t v) {
+  // CHECK: call i32 @llvm.arm.cls(i32 %v)
+  return __builtin_arm_cls(v);
+}
+
+unsigned int clsl(unsigned long v) {
+  // CHECK: call i32 @llvm.arm.cls(i32 %v)
+  return __builtin_arm_cls(v);
+}
+
+unsigned int clsll(uint64_t v) {
+  // CHECK: call i32 @llvm.arm.cls64(i64 %v)
+  return __builtin_arm_cls64(v);
+}
+
 // CHECK: ![[M0]] = !{!"cp1:2:c3:c4:5"}
 // CHECK: ![[M1]] = !{!"cp1:2:c3"}
 // CHECK: ![[M2]] = !{!"sysreg"}
diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c
index 7095396..f16cd45 100644
--- a/clang/test/CodeGen/builtins-arm64.c
+++ b/clang/test/CodeGen/builtins-arm64.c
@@ -106,4 +106,21 @@
   __builtin_arm_wsrp("1:2:3:4:5", v);
 }
 
+unsigned int cls(uint32_t v) {
+  // CHECK: call i32 @llvm.aarch64.cls(i32 %v)
+  return __builtin_arm_cls(v);
+}
+
+unsigned int clsl(unsigned long v) {
+  // CHECK-WIN: [[V64:%[^ ]+]] = zext i32 %v to i64
+  // CHECK-WIN: call i32 @llvm.aarch64.cls64(i64 [[V64]]
+  // CHECK-LINUX: call i32 @llvm.aarch64.cls64(i64 %v)
+  return __builtin_arm_cls64(v);
+}
+
+unsigned int clsll(uint64_t v) {
+  // CHECK: call i32 @llvm.aarch64.cls64(i64 %v)
+  return __builtin_arm_cls64(v);
+}
+
 // CHECK: ![[M0]] = !{!"1:2:3:4:5"}