Fix the representation of wide strings in the AST and IR so that the elements
use the native representation of integers. This fixes a bunch of nastiness
involving treating wide strings as a series of bytes.

Patch by Seth Cantrell.



git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@143417 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/CGExprConstant.cpp b/lib/CodeGen/CGExprConstant.cpp
index 0622c10..889cdd8 100644
--- a/lib/CodeGen/CGExprConstant.cpp
+++ b/lib/CodeGen/CGExprConstant.cpp
@@ -817,13 +817,7 @@
   }
 
   llvm::Constant *VisitStringLiteral(StringLiteral *E) {
-    assert(!E->getType()->isPointerType() && "Strings are always arrays");
-
-    // This must be a string initializing an array in a static initializer.
-    // Don't emit it as the address of the string, emit the string data itself
-    // as an inline array.
-    return llvm::ConstantArray::get(VMContext,
-                                    CGM.GetStringForStringLiteral(E), false);
+    return CGM.GetConstantArrayFromStringLiteral(E);
   }
 
   llvm::Constant *VisitObjCEncodeExpr(ObjCEncodeExpr *E) {
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index c796e0d..0905c4b 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -2037,6 +2037,8 @@
 /// GetStringForStringLiteral - Return the appropriate bytes for a
 /// string literal, properly padded to match the literal type.
 std::string CodeGenModule::GetStringForStringLiteral(const StringLiteral *E) {
+  assert((E->isAscii() || E->isUTF8())
+         && "Use GetConstantArrayFromStringLiteral for wide strings");
   const ASTContext &Context = getContext();
   const ConstantArrayType *CAT =
     Context.getAsConstantArrayType(E->getType());
@@ -2045,27 +2047,44 @@
   // Resize the string to the right size.
   uint64_t RealLen = CAT->getSize().getZExtValue();
 
-  switch (E->getKind()) {
-  case StringLiteral::Ascii:
-  case StringLiteral::UTF8:
-    break;
-  case StringLiteral::Wide:
-    RealLen *= Context.getTargetInfo().getWCharWidth() / Context.getCharWidth();
-    break;
-  case StringLiteral::UTF16:
-    RealLen *= Context.getTargetInfo().getChar16Width() / Context.getCharWidth();
-    break;
-  case StringLiteral::UTF32:
-    RealLen *= Context.getTargetInfo().getChar32Width() / Context.getCharWidth();
-    break;
-  }
-
   std::string Str = E->getString().str();
   Str.resize(RealLen, '\0');
 
   return Str;
 }
 
+llvm::Constant *
+CodeGenModule::GetConstantArrayFromStringLiteral(const StringLiteral *E) {
+  assert(!E->getType()->isPointerType() && "Strings are always arrays");
+  
+  // Don't emit it as the address of the string, emit the string data itself
+  // as an inline array.
+  if (E->getCharByteWidth()==1) {
+    return llvm::ConstantArray::get(VMContext,
+                                    GetStringForStringLiteral(E), false);
+  } else {
+    llvm::ArrayType *AType =
+      cast<llvm::ArrayType>(getTypes().ConvertType(E->getType()));
+    llvm::Type *ElemTy = AType->getElementType();
+    unsigned NumElements = AType->getNumElements();
+    std::vector<llvm::Constant*> Elts;
+    Elts.reserve(NumElements);
+    
+    for(unsigned i=0;i<E->getLength();++i) {
+      unsigned value = E->getCodeUnit(i);
+      llvm::Constant *C = llvm::ConstantInt::get(ElemTy,value,false);
+      Elts.push_back(C);
+    }
+    for(unsigned i=E->getLength();i<NumElements;++i) {
+      llvm::Constant *C = llvm::ConstantInt::get(ElemTy,0,false);
+      Elts.push_back(C);
+    }
+    
+    return llvm::ConstantArray::get(AType, Elts);
+  }
+
+}
+
 /// GetAddrOfConstantStringFromLiteral - Return a pointer to a
 /// constant array for the given string literal.
 llvm::Constant *
@@ -2073,15 +2092,23 @@
   // FIXME: This can be more efficient.
   // FIXME: We shouldn't need to bitcast the constant in the wide string case.
   CharUnits Align = getContext().getTypeAlignInChars(S->getType());
-  llvm::Constant *C = GetAddrOfConstantString(GetStringForStringLiteral(S),
-                                              /* GlobalName */ 0,
-                                              Align.getQuantity());
-  if (S->isWide() || S->isUTF16() || S->isUTF32()) {
-    llvm::Type *DestTy =
-        llvm::PointerType::getUnqual(getTypes().ConvertType(S->getType()));
-    C = llvm::ConstantExpr::getBitCast(C, DestTy);
+  if (S->isAscii() || S->isUTF8()) {
+    return GetAddrOfConstantString(GetStringForStringLiteral(S),
+                                   /* GlobalName */ 0,
+                                   Align.getQuantity());
   }
-  return C;
+
+  // FIXME: the following does not memoize wide strings
+  llvm::Constant *C = GetConstantArrayFromStringLiteral(S);
+  llvm::GlobalVariable *GV =
+    new llvm::GlobalVariable(getModule(),C->getType(),
+                             !Features.WritableStrings,
+                             llvm::GlobalValue::PrivateLinkage,
+                             C,".str");
+  GV->setAlignment(Align.getQuantity());
+  GV->setUnnamedAddr(true);
+  
+  return GV;
 }
 
 /// GetAddrOfConstantStringFromObjCEncode - Return a pointer to a constant
diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h
index ea2e177..0ce698a 100644
--- a/lib/CodeGen/CodeGenModule.h
+++ b/lib/CodeGen/CodeGenModule.h
@@ -565,6 +565,10 @@
   /// -fconstant-string-class=class_name option.
   llvm::Constant *GetAddrOfConstantString(const StringLiteral *Literal);
 
+  /// GetConstantArrayFromStringLiteral - Return a constant array for the given
+  /// string.
+  llvm::Constant *GetConstantArrayFromStringLiteral(const StringLiteral *E);
+
   /// GetAddrOfConstantStringFromLiteral - Return a pointer to a constant array
   /// for the given string literal.
   llvm::Constant *GetAddrOfConstantStringFromLiteral(const StringLiteral *S);