diff --git a/lib/CodeGen/CGBlocks.cpp b/lib/CodeGen/CGBlocks.cpp
index 163046e..65f6cb1 100644
--- a/lib/CodeGen/CGBlocks.cpp
+++ b/lib/CodeGen/CGBlocks.cpp
@@ -109,13 +109,14 @@
 
 // FIXME: Push most into CGM, passing down a few bits, like current
 // function name.
-llvm::Constant *CodeGenFunction::BuildBlockLiteralTmp(const BlockExpr *BE) {
+llvm::Value *CodeGenFunction::BuildBlockLiteralTmp(const BlockExpr *BE) {
   bool insideFunction = false;
   bool BlockRefDeclList = false;
   bool BlockByrefDeclList = false;
 
   std::vector<llvm::Constant*> Elts;
   llvm::Constant *C;
+  llvm::Value *V;
 
   {
     // C = BuildBlockStructInitlist();
@@ -152,32 +153,77 @@
       if (ND->getIdentifier())
         Name = ND->getNameAsCString();
     BlockInfo Info(0, Name);
-    uint64_t subBlockSize;
+    uint64_t subBlockSize, subBlockAlign;
+    llvm::SmallVector<ValueDecl *, 8> subBlockDeclRefDecls;
     llvm::Function *Fn
-      = CodeGenFunction(CGM).GenerateBlockFunction(BE, Info, subBlockSize);
+      = CodeGenFunction(CGM).GenerateBlockFunction(BE, Info, subBlockSize,
+                                                   subBlockAlign, subBlockDeclRefDecls);
     Elts.push_back(Fn);
 
     // __descriptor
     Elts.push_back(BuildDescriptorBlockDecl(subBlockSize));
 
-    // FIXME: Add block_original_ref_decl_list and block_byref_decl_list.
+    // FIXME: Also check to make sure there are no byref variables
+    if (subBlockDeclRefDecls.size() == 0) {
+      C = llvm::ConstantStruct::get(Elts);
+
+      char Name[32];
+      sprintf(Name, "__block_holder_tmp_%d", CGM.getGlobalUniqueCount());
+      C = new llvm::GlobalVariable(C->getType(), true,
+                                   llvm::GlobalValue::InternalLinkage,
+                                   C, Name, &CGM.getModule());
+      QualType BPT = BE->getType();
+      C = llvm::ConstantExpr::getBitCast(C, ConvertType(BPT));
+      return C;
+    }
+      
+    std::vector<const llvm::Type *> Types(5+subBlockDeclRefDecls.size());
+    for (int i=0; i<5; ++i)
+      Types[i] = Elts[i]->getType();
+
+    for (unsigned i=0; i < subBlockDeclRefDecls.size(); ++i)
+      Types[i+5] = ConvertType(subBlockDeclRefDecls[i]->getType());
+
+    llvm::Type *Ty = llvm::StructType::get(Types, true);
+
+    llvm::AllocaInst *A = CreateTempAlloca(Ty);
+    A->setAlignment(subBlockAlign);
+    V = A;
+
+    for (unsigned i=0; i<5; ++i)
+      Builder.CreateStore(Elts[i], Builder.CreateStructGEP(V, i, "block.tmp"));
+    
+    for (unsigned i=0; i < subBlockDeclRefDecls.size(); ++i)
+      {
+        ValueDecl *VD = subBlockDeclRefDecls[i];
+
+        if (VD->getIdentifier() == 0)
+            continue;
+        SourceLocation Loc = VD->getLocation();
+        DeclRefExpr D(VD, VD->getType(), Loc);
+        llvm::Value* Addr = Builder.CreateStructGEP(V, i+5, "tmp");
+        RValue r = EmitAnyExpr(&D, Addr, false);
+        if (r.isScalar())
+          Builder.CreateStore(r.getScalarVal(), Addr);
+        else if (r.isComplex())
+          // FIXME: implement
+          ErrorUnsupported(BE, "complex in block literal");
+        else if (r.isAggregate())
+          ; // Already created into the destination
+        else
+          assert (0 && "bad block variable");
+        // FIXME: Ensure that the offset created by the backend for
+        // the struct matches the previously computed offset in BlockDecls.
+      }
+
+    // FIXME: Add block_byref_decl_list.
   }
   
-  C = llvm::ConstantStruct::get(Elts);
-
-  char Name[32];
-  sprintf(Name, "__block_holder_tmp_%d", CGM.getGlobalUniqueCount());
-  C = new llvm::GlobalVariable(C->getType(), true,
-                               llvm::GlobalValue::InternalLinkage,
-                               C, Name, &CGM.getModule());
   QualType BPT = BE->getType();
-  C = llvm::ConstantExpr::getBitCast(C, ConvertType(BPT));
-  return C;
+  return Builder.CreateBitCast(V, ConvertType(BPT));
 }
 
 
-
-
 const llvm::Type *CodeGenModule::getBlockDescriptorType() {
   if (BlockDescriptorType)
     return BlockDescriptorType;
@@ -365,9 +411,12 @@
   llvm::Constant *LiteralFields[5];
 
   CodeGenFunction::BlockInfo Info(0, n);
-  uint64_t subBlockSize;
+  uint64_t subBlockSize, subBlockAlign;
+  llvm::SmallVector<ValueDecl *, 8> subBlockDeclRefDecls;
   llvm::Function *Fn
-    = CodeGenFunction(*this).GenerateBlockFunction(BE, Info, subBlockSize);
+    = CodeGenFunction(*this).GenerateBlockFunction(BE, Info, subBlockSize,
+                                                   subBlockAlign,
+                                                   subBlockDeclRefDecls);
   assert(subBlockSize == BlockLiteralSize
          && "no imports allowed for global block");
 
@@ -404,7 +453,9 @@
 
 llvm::Function *CodeGenFunction::GenerateBlockFunction(const BlockExpr *Expr,
                                                        const BlockInfo& Info,
-                                                       uint64_t &Size) {
+                                                       uint64_t &Size,
+                                                       uint64_t &Align,
+                                                       llvm::SmallVector<ValueDecl *, 8> &subBlockDeclRefDecls) {
   const FunctionTypeProto *FTy =
     cast<FunctionTypeProto>(Expr->getFunctionType());
 
@@ -442,7 +493,13 @@
   EmitStmt(Expr->getBody());
   FinishFunction(cast<CompoundStmt>(Expr->getBody())->getRBracLoc());
 
+  // The runtime needs a minimum alignment of a void *.
+  uint64_t MinAlign = getContext().getTypeAlign(getContext().VoidPtrTy) / 8;
+  BlockOffset = llvm::RoundUpToAlignment(BlockOffset, MinAlign);
+
   Size = BlockOffset;
+  Align = BlockAlign;
+  subBlockDeclRefDecls = BlockDeclRefDecls;
 
   return Fn;
 }
diff --git a/lib/CodeGen/CGExprConstant.cpp b/lib/CodeGen/CGExprConstant.cpp
index 487b274..93bc53f 100644
--- a/lib/CodeGen/CGExprConstant.cpp
+++ b/lib/CodeGen/CGExprConstant.cpp
@@ -452,7 +452,7 @@
     case Expr::BlockExprClass: {
       BlockExpr *B = cast<BlockExpr>(E);
       if (!B->hasBlockDeclRefExprs())
-        return CGF->BuildBlockLiteralTmp(B);
+        return cast<llvm::Constant>(CGF->BuildBlockLiteralTmp(B));
     }
     }
 
diff --git a/lib/CodeGen/CGExprScalar.cpp b/lib/CodeGen/CGExprScalar.cpp
index 4a2fe99..020c21f 100644
--- a/lib/CodeGen/CGExprScalar.cpp
+++ b/lib/CodeGen/CGExprScalar.cpp
@@ -607,11 +607,7 @@
     return VisitExpr(E);
   }
 
-  // FIXME: We have most of the easy codegen for the helper, but we need to
-  // ensure we don't need copy/dispose, and we need to add the variables into
-  // the block literal still.
-  CGF.ErrorUnsupported(E, "scalar expression");
-
+  // FIXME: ensure we don't need copy/dispose.
   uint64_t &offset = CGF.BlockDecls[E->getDecl()];
 
   const llvm::Type *Ty;
@@ -1389,8 +1385,8 @@
 }
 
 Value *ScalarExprEmitter::VisitBlockExpr(const BlockExpr *BE) {
-  llvm::Constant *C = CGF.BuildBlockLiteralTmp(BE);
-  return C;
+  // May be a stack temporary rather than a constant if variables are imported.
+  return CGF.BuildBlockLiteralTmp(BE);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/CodeGenFunction.cpp b/lib/CodeGen/CodeGenFunction.cpp
index 879b858..5fb0b2b 100644
--- a/lib/CodeGen/CodeGenFunction.cpp
+++ b/lib/CodeGen/CodeGenFunction.cpp
@@ -38,6 +38,7 @@
   else
     BlockOffset = CGM.getTargetData()
       .getTypeStoreSizeInBits(CGM.getGenericExtendedBlockLiteralType()) / 8;
+  BlockAlign = getContext().getTypeAlign(getContext().VoidPtrTy) / 8;
 }
 
 ASTContext &CodeGenFunction::getContext() const {
diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h
index 6ee22bb..8b5986c 100644
--- a/lib/CodeGen/CodeGenFunction.h
+++ b/lib/CodeGen/CodeGenFunction.h
@@ -264,7 +264,7 @@
   //                                  Block Bits
   //===--------------------------------------------------------------------===//
 
-  llvm::Constant *BuildBlockLiteralTmp(const BlockExpr *);
+  llvm::Value *BuildBlockLiteralTmp(const BlockExpr *);
   llvm::Constant *BuildDescriptorBlockDecl(uint64_t Size);
 
   /// BlockInfo - Information to generate a block literal.
@@ -281,7 +281,8 @@
 
   llvm::Function *GenerateBlockFunction(const BlockExpr *Expr,
                                         const BlockInfo& Info,
-                                        uint64_t &Size);
+                                        uint64_t &Size, uint64_t &Align,
+                                        llvm::SmallVector<ValueDecl *, 8> &subBlockDeclRefDecls);
 
   ImplicitParamDecl *BlockStructDecl;
 
@@ -292,21 +293,40 @@
   /// BlockHasCopyDispose - True iff the block uses copy/dispose.
   bool BlockHasCopyDispose;
 
+  /// BlockDeclRefDecls - Decls from BlockDeclRefExprs in appearance order
+  /// in a block literal.  Decls without names are used for padding.
+  llvm::SmallVector<ValueDecl *, 8> BlockDeclRefDecls;
+
+  /// BlockOffset - The offset in bytes for the next allocation of an
+  /// imported block variable.
   uint64_t BlockOffset;
-  /// getBlockOffset - Offset for next allocated variable use in a BlockExpr.
-  uint64_t getBlockOffset(uint64_t Size, uint64_t Align) {
-    assert (((Align >> 3) > 0) && "alignment must be 1 byte or more");
-    assert (((Align & 7) == 0)
-            && "alignment must be on at least byte boundaries");
-    // Ensure proper alignment, even if it means we have to have a gap
-    BlockOffset = llvm::RoundUpToAlignment(BlockOffset, Align/8);
-      
-    BlockOffset += Size;
-    return BlockOffset-Size;
-  }
+  /// BlockAlign - Maximal alignment needed for the Block expressed in bytes.
+  uint64_t BlockAlign;
+  /// getBlockOffset - Allocate aligned room for D; returns D's byte offset.
   uint64_t getBlockOffset(ValueDecl *D) {
     uint64_t Size = getContext().getTypeSize(D->getType()) / 8;
-    return getBlockOffset(Size, getContext().getDeclAlignInBytes(D)*8);
+    uint64_t Align = getContext().getDeclAlignInBytes(D);
+
+    assert ((Align > 0) && "alignment must be 1 byte or more");
+
+    uint64_t OldOffset = BlockOffset;
+
+    // Ensure proper alignment, even if it means we have to have a gap
+    BlockOffset = llvm::RoundUpToAlignment(BlockOffset, Align);
+    BlockAlign = std::max(Align, BlockAlign);
+
+    uint64_t Pad = BlockOffset - OldOffset;
+    // Record the gap as an unnamed char[Pad] decl; the size may be zero.
+    QualType PadTy = getContext().getConstantArrayType(getContext().CharTy,
+                                                       llvm::APInt(32, Pad),
+                                                       ArrayType::Normal, 0);
+    ValueDecl *PadDecl = VarDecl::Create(getContext(), 0, SourceLocation(),
+                                         0, QualType(PadTy), VarDecl::None, SourceLocation());
+    BlockDeclRefDecls.push_back(PadDecl);
+    BlockDeclRefDecls.push_back(D);
+    // Hand out the aligned offset and advance past D.
+    BlockOffset += Size;
+    return BlockOffset-Size;
   }
   std::map<Decl*, uint64_t> BlockDecls;
 
