Introduce __builtin_nontemporal_store and __builtin_nontemporal_load.

Summary:
Currently clang provides no general way to generate nontemporal loads/stores.
There are some architecture-specific builtins for doing so (e.g. on x86), but
there is no way to generate a nontemporal store on, e.g., AArch64. This patch
adds generic builtins which are expanded to a simple load or store with
'!nontemporal' metadata attached in IR.

Differential Revision: http://reviews.llvm.org/D12313

llvm-svn: 247104
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c35f25a..729c0a1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -111,6 +111,28 @@
   return EmitFromInt(CGF, Result, T, ValueType);
 }
 
+static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) {
+  Value *Val = CGF.EmitScalarExpr(E->getArg(0));
+  Value *Address = CGF.EmitScalarExpr(E->getArg(1));
+
+  // Convert the type of the pointer to a pointer to the stored type.
+  Val = CGF.EmitToMemory(Val, E->getArg(0)->getType());
+  Value *BC = CGF.Builder.CreateBitCast(
+      Address, llvm::PointerType::getUnqual(Val->getType()), "cast");
+  LValue LV = CGF.MakeNaturalAlignAddrLValue(BC, E->getArg(0)->getType());
+  LV.setNontemporal(true);
+  CGF.EmitStoreOfScalar(Val, LV, false);
+  return nullptr;
+}
+
+static Value *EmitNontemporalLoad(CodeGenFunction &CGF, const CallExpr *E) {
+  Value *Address = CGF.EmitScalarExpr(E->getArg(0));
+
+  LValue LV = CGF.MakeNaturalAlignAddrLValue(Address, E->getType());
+  LV.setNontemporal(true);
+  return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
+}
+
 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
                                llvm::AtomicRMWInst::BinOp Kind,
                                const CallExpr *E) {
@@ -1143,6 +1165,10 @@
     return RValue::get(nullptr);
   }
 
+  case Builtin::BI__builtin_nontemporal_load:
+    return RValue::get(EmitNontemporalLoad(*this, E));
+  case Builtin::BI__builtin_nontemporal_store:
+    return RValue::get(EmitNontemporalStore(*this, E));
   case Builtin::BI__c11_atomic_is_lock_free:
   case Builtin::BI__atomic_is_lock_free: {
     // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 6635e57..b7d1ef1 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1160,7 +1160,8 @@
   return EmitLoadOfScalar(lvalue.getAddress(), lvalue.isVolatile(),
                           lvalue.getType(), Loc, lvalue.getAlignmentSource(),
                           lvalue.getTBAAInfo(),
-                          lvalue.getTBAABaseType(), lvalue.getTBAAOffset());
+                          lvalue.getTBAABaseType(), lvalue.getTBAAOffset(),
+                          lvalue.isNontemporal());
 }
 
 static bool hasBooleanRepresentation(QualType Ty) {
@@ -1226,7 +1227,8 @@
                                                AlignmentSource AlignSource,
                                                llvm::MDNode *TBAAInfo,
                                                QualType TBAABaseType,
-                                               uint64_t TBAAOffset) {
+                                               uint64_t TBAAOffset,
+                                               bool isNontemporal) {
   // For better performance, handle vector loads differently.
   if (Ty->isVectorType()) {
     const llvm::Type *EltTy = Addr.getElementType();
@@ -1258,6 +1260,11 @@
   }
 
   llvm::LoadInst *Load = Builder.CreateLoad(Addr, Volatile);
+  if (isNontemporal) {
+    llvm::MDNode *Node = llvm::MDNode::get(
+        Load->getContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+    Load->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
+  }
   if (TBAAInfo) {
     llvm::MDNode *TBAAPath = CGM.getTBAAStructTagInfo(TBAABaseType, TBAAInfo,
                                                       TBAAOffset);
@@ -1330,7 +1337,8 @@
                                         AlignmentSource AlignSource,
                                         llvm::MDNode *TBAAInfo,
                                         bool isInit, QualType TBAABaseType,
-                                        uint64_t TBAAOffset) {
+                                        uint64_t TBAAOffset,
+                                        bool isNontemporal) {
 
   // Handle vectors differently to get better performance.
   if (Ty->isVectorType()) {
@@ -1365,6 +1373,12 @@
   }
 
   llvm::StoreInst *Store = Builder.CreateStore(Value, Addr, Volatile);
+  if (isNontemporal) {
+    llvm::MDNode *Node =
+        llvm::MDNode::get(Store->getContext(),
+                          llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+    Store->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
+  }
   if (TBAAInfo) {
     llvm::MDNode *TBAAPath = CGM.getTBAAStructTagInfo(TBAABaseType, TBAAInfo,
                                                       TBAAOffset);
@@ -1378,7 +1392,7 @@
   EmitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(),
                     lvalue.getType(), lvalue.getAlignmentSource(),
                     lvalue.getTBAAInfo(), isInit, lvalue.getTBAABaseType(),
-                    lvalue.getTBAAOffset());
+                    lvalue.getTBAAOffset(), lvalue.isNontemporal());
 }
 
 /// EmitLoadOfLValue - Given an expression that represents a value lvalue, this
diff --git a/clang/lib/CodeGen/CGValue.h b/clang/lib/CodeGen/CGValue.h
index 195571b..3ccc4cd 100644
--- a/clang/lib/CodeGen/CGValue.h
+++ b/clang/lib/CodeGen/CGValue.h
@@ -202,6 +202,10 @@
 
   unsigned AlignSource : 2;
 
+  // This flag indicates whether nontemporal loads/stores should be used when
+  // accessing this lvalue.
+  bool Nontemporal : 1;
+
   Expr *BaseIvarExp;
 
   /// Used by struct-path-aware TBAA.
@@ -228,6 +232,7 @@
     // Initialize Objective-C flags.
     this->Ivar = this->ObjIsArray = this->NonGC = this->GlobalObjCRef = false;
     this->ImpreciseLifetime = false;
+    this->Nontemporal = false;
     this->ThreadLocalRef = false;
     this->BaseIvarExp = nullptr;
 
@@ -277,6 +282,8 @@
   void setARCPreciseLifetime(ARCPreciseLifetime_t value) {
     ImpreciseLifetime = (value == ARCImpreciseLifetime);
   }
+  bool isNontemporal() const { return Nontemporal; }
+  void setNontemporal(bool Value) { Nontemporal = Value; }
 
   bool isObjCWeak() const {
     return Quals.getObjCGCAttr() == Qualifiers::Weak;
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 86154e9..3925932 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -2449,7 +2449,8 @@
                                   AlignmentSource::Type,
                                 llvm::MDNode *TBAAInfo = nullptr,
                                 QualType TBAABaseTy = QualType(),
-                                uint64_t TBAAOffset = 0);
+                                uint64_t TBAAOffset = 0,
+                                bool isNontemporal = false);
 
   /// EmitLoadOfScalar - Load a scalar value from an address, taking
   /// care to appropriately convert from the memory representation to
@@ -2465,7 +2466,7 @@
                          AlignmentSource AlignSource = AlignmentSource::Type,
                          llvm::MDNode *TBAAInfo = nullptr, bool isInit = false,
                          QualType TBAABaseTy = QualType(),
-                         uint64_t TBAAOffset = 0);
+                         uint64_t TBAAOffset = 0, bool isNontemporal = false);
 
   /// EmitStoreOfScalar - Store a scalar value to an address, taking
   /// care to appropriately convert from the memory representation to